From 5bd71236c76ac497466602550b1bc9de884fd1b3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Mar 2024 14:09:26 -0400 Subject: [PATCH 01/44] run CI per commit only on inference branch --- .github/workflows/gpu-ci.yml | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 48dcda157e..7bdb6805a8 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -1,25 +1,8 @@ name: "gpu-ci" on: - pull_request: - paths: - - "cmake/**" - - "config/**" - - "deps/**" - - "python/**" - - "setup.py" - - "include/**" - - "inference/**" - - "src/**" - - "tests/inference/**" - - "conda/flexflow.yml" - - ".github/workflows/gpu-ci.yml" - - "tests/cpp_gpu_tests.sh" - - "tests/inference_tests.sh" - - "tests/training_tests.sh" - - "tests/python_interface_test.sh" push: branches: - - "master" + - "inference" paths: - "cmake/**" - "config/**" @@ -194,7 +177,7 @@ jobs: - name: Save inference output as an artifact if: always() - run: | + run: | cd inference tar -zcvf output.tar.gz ./output From e0a6e4fee228ca31a74e69dd84d73e01762214a1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Mar 2024 14:29:47 -0400 Subject: [PATCH 02/44] fix --- python/flexflow/serve/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 14555bfc12..cbc4122897 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -375,7 +375,7 @@ def compile( self.rm.set_max_spec_tree_token_num( self.model_configs.max_spec_tree_token_num if "max_spec_tree_token_num" - in self.model_configs.max_spec_tree_token_num.__dict__ + in self.model_configs.__dict__ else 20 ) From 1210256080072935fecd71dbf7cbfb31d9f99efa Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+aetiurf@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:02:15 +0800 Subject: [PATCH 03/44] fix: 'model_configs' AttributeError (#1358) --- python/flexflow/serve/serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index cbc4122897..ac622b3337 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -373,9 +373,9 @@ def compile( model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( - self.model_configs.max_spec_tree_token_num + model_configs.max_spec_tree_token_num if "max_spec_tree_token_num" - in self.model_configs.__dict__ + in model_configs.__dict__ else 20 ) From b4a639c8990f2d031ee4938f3e7dc8140e4eb324 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 7 Apr 2024 23:26:53 -0400 Subject: [PATCH 04/44] Changes to support Perlmutter environment (#1360) * . 
* remove deadcode * add benchmarking mode, initializing weights randomly * better logging when running out of memory * update --------- Co-authored-by: Gabriele Oliaro --- cmake/cuda.cmake | 15 ++- config/config.inc | 12 +- config/config.linux | 14 ++- include/flexflow/config.h | 2 +- inference/incr_decoding/incr_decoding.cc | 4 +- inference/models/falcon.cc | 20 ---- inference/models/llama.cc | 10 -- inference/models/mpt.cc | 15 --- inference/models/opt.cc | 18 --- inference/models/starcoder.cc | 10 -- inference/python/incr_decoding.py | 3 +- inference/python/spec_infer.py | 3 +- inference/spec_infer/spec_infer.cc | 4 +- inference/utils/download_hf_model.py | 4 +- python/flexflow/core/__init__.py | 1 + python/flexflow/serve/__init__.py | 8 ++ src/mapper/mapper.cc | 46 ++++++-- src/runtime/file_loader.cc | 109 ++++++++++-------- src/runtime/model.cc | 8 +- .../python_test_configs/generate_configs.py | 3 +- 20 files changed, 159 insertions(+), 150 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 68e4ca07b1..45ecc1798b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -13,8 +13,19 @@ if(CUDA_FOUND) # set cuda runtime and driver lib # override cublas and curand because the FindCUDA module may not find the correct libs set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT}) - set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT}) - set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT}) + if(CUBLAS_PATH) + set(CUBLAS_ROOT ${CUBLAS_PATH}) + else() + set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + endif() + set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT}) + if(CURAND_PATH) + set(CURAND_ROOT ${CURAND_PATH}) + else() + set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + endif() + set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT}) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${CUDADRV_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} diff --git a/config/config.inc b/config/config.inc index 1121c114c4..7d7b2db9cf 100644 --- a/config/config.inc +++ b/config/config.inc @@ -62,6 +62,16 @@ if [ -n "$CUDA_DIR" ]; then SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}" fi +# set cublas dir +if [ -n "$CUBLAS_DIR" ]; then + SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}" +fi + +# set curand dir +if [ -n "$CURAND_DIR" ]; then + SET_CURAND="-DCURAND_PATH=${CURAND_DIR}" +fi + # set cudnn dir if [ -n "$CUDNN_DIR" ]; then SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}" @@ -231,7 +241,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} 
${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 30edfa7dfe..acffc210f5 100755 --- a/config/config.linux +++ b/config/config.linux @@ -36,12 +36,18 @@ FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # or all available architectures. TODO: support autodetect FF_HIP_ARCH=${FF_HIP_ARCH:-"all"} -# set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} - # set CUDA dir in case cmake cannot autodetect a path CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +# set CUBLAS dir in case it is not stored in the CUDA DIR +CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"} + +# set CURAND dir in case it is not stored in the CUDA DIR +CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"} + +# set CUDNN dir in case cmake cannot autodetect a path +CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} + # if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, # otherwise, we will build nccl from source NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"} @@ -102,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 17a3f59e29..2c11ae1131 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -145,7 +145,7 @@ class FFConfig { Legion::Runtime *lg_hlr; Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; - bool syntheticInput, profiling, perform_fusion; + bool benchmarking, profiling, perform_fusion; bool inference_debugging; size_t simulator_work_space_size; size_t search_budget; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 
f88af3bc43..aae7256ffe 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -107,7 +107,9 @@ void parse_input_args(char **argv, } } if (paths.cache_folder_path.empty()) { - paths.cache_folder_path = "~/.cache/flexflow"; + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); } // Expand ~ to the home directory if needed wordexp_t p; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index e00f4e9cfd..a529411ddb 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -252,26 +252,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 14b8c31fa1..517f534438 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -277,16 +277,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 7e8fc8358f..70e2b5e9c5 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -259,21 +259,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 3ff4c96fdf..5677d5658e 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -266,24 +266,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile 
the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2327c86119..8b0dc1098c 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -232,16 +232,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index f7707816c8..05599ea6b9 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -62,7 +63,7 @@ def get_configs(): # required parameters "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "prompt": "", diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index fcb1b8f891..a6dfa8042e 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -62,7 +63,7 @@ def get_configs(): # required llm arguments "llm_model": "meta-llama/Llama-2-7b-hf", # optional llm parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "ssms": [ diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b6c1e408cd..f7edfd7696 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -124,7 +124,9 @@ void parse_input_args(char **argv, } } if (paths.cache_folder_path.empty()) { - paths.cache_folder_path = "~/.cache/flexflow"; + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); } // Expand ~ to the home directory if needed wordexp_t p; diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 94a8c23e68..7b4f4d6fb0 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import flexflow.serve as ff -import argparse +import argparse, os def parse_args(): @@ -12,7 +12,7 @@ def parse_args(): "--cache-folder", type=str, help="Folder to use to store the model(s) assets in FlexFlow format", - default="", + default=os.environ.get("FF_CACHE_PATH", ""), ) parser.add_argument( "--refresh-cache", diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index d7b1a595d2..2820cf485a 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -41,6 +41,7 @@ "num_cpus": "-ll:cpu", "legion_utility_processors": "-ll:util", "profiling": "--profiling", + "benchmarking": "--benchmarking", "inference_debugging": "--inference-debugging", "fusion": "--fusion", "disable_control_replication": "--disable-control-replication", diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index cf467280bd..5af077273d 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -45,6 +45,7 @@ def init( use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, profiling: Optional[bool] = None, + benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, fusion: Optional[bool] = None, ): @@ -72,6 +73,7 @@ def init( - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - profiling: whether to enable the FlexFlow profiling mode, defaults to False + - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -106,6 +108,8 @@ def init( :type use_8bit_quantization: Optional[bool], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional + :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False + :type benchmarking: Optional[bool], optional :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False :type inference_debugging: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -132,6 +136,7 @@ def init( use_4bit_quantization is not None, use_8bit_quantization is not None, profiling is not None, + benchmarking is not None, inference_debugging is not None, fusion is not None, ] @@ -157,6 +162,7 @@ def init( "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, "profiling": profiling, + "benchmarking": benchmarking, "inference_debugging": inference_debugging, "fusion": fusion, } @@ -201,6 +207,8 @@ def init( configs_dict["use_8bit_quantization"] = False if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False + if configs_dict.get("benchmarking", None) is None: + configs_dict["benchmarking"] = 
False if configs_dict.get("inference_debugging", None) is None: configs_dict["inference_debugging"] = False if configs_dict.get("fusion", None) is None: diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7aac4e37c..c293aecb19 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -487,6 +487,25 @@ void FFMapper::premap_task(const MapperContext ctx, assert(false); } +std::string humanReadableSize(size_t size, bool mb = false) { + assert(size >= 0); + char const *units[] = {"B", "KiB", "MiB", "GiB", "TiB"}; + int i = 0; + double finalSize = size; + if (mb) { + finalSize /= 1024 * 1024; + i = 2; + } else { + while (finalSize >= 1024 && i < 4) { + finalSize /= 1024; + i++; + } + } + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%.2lf %s", finalSize, units[i]); + return std::string(buffer); +} + void FFMapper::map_task(const MapperContext ctx, Task const &task, MapTaskInput const &input, @@ -637,16 +656,19 @@ void FFMapper::map_task(const MapperContext ctx, } // Report failed to creation log_ff_mapper.error( - "FlexFlow failed allocation of size %zd bytes for " - "region requirement %d of task %s (UID %lld) in memory " IDFMT - " with kind %d for processor " IDFMT ".", - footprint, + "Out of memory! FlexFlow failed to reserve block of size %s" + " for region requirement %d of task %s (UID %lld) in %s memory (id: " + "%llx)" + " for processor id: %llx." + " Total pre-allocated memory capacity of this kind: %s.", + humanReadableSize(footprint).c_str(), idx, task.get_task_name(), task.get_unique_id(), + Legion::Mapping::Utilities::to_string(target_mem.kind()), target_mem.id, - target_mem.kind(), - task.target_proc.id); + task.target_proc.id, + humanReadableSize(target_mem.capacity(), true).c_str()); assert(false); } else { output.chosen_instances[idx].push_back(result); @@ -929,15 +951,17 @@ void FFMapper::map_inline(const MapperContext ctx, created, &footprint)) { log_ff_mapper.error( - "FlexFlow Mapper failed allocation of size %zd bytes" + "Out of memory! FlexFlow failed to reserve block of size %s" " for region requirement of inline mapping in task %s (UID %lld)" - " in memory " IDFMT "for processor " IDFMT ".", - footprint, + " in %s memory (id: %llx) for processor id: %llx." 
+ " Total pre-allocated memory capacity of this kind: %s.", + humanReadableSize(footprint).c_str(), inline_op.parent_task->get_task_name(), inline_op.parent_task->get_unique_id(), + Legion::Mapping::Utilities::to_string(target_memory.kind()), target_memory.id, - inline_op.parent_task->current_proc.id); - printf("target_memory.kind() = %d\n", target_memory.kind()); + inline_op.parent_task->current_proc.id, + humanReadableSize(target_memory.capacity(), true).c_str()); assert(false); } else { output.chosen_instances.push_back(result); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 56558b3185..43ce9d7005 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -725,60 +725,69 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, std::string weight_filename = removeGuidOperatorName(std::string(l->name)); - if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); + if (ff->config.benchmarking) { + std::cout << "Initializing weight " << weight_filename + << " with random data (benchmarking mode)" << std::endl; + // If benchmarking, we don't need to load the weights + // We can just fill the weight tensor with random data + } else { + if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + if (weight_filename.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, weight_filename, weights_folder, hidden_dim, num_heads); + } else if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); + } else { + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); + } + } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); + assert(false); } - + } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { + assert(weight_idx >= 0 || weight_idx <= 2); + weight_filename += (weight_idx == 0) + ? "_attn_bias" + : ((weight_idx == 1) ? 
"_weight" : "_bias"); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } else { - assert(false); - } - } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { - assert(weight_idx >= 0 || weight_idx <= 2); - weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); - std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); - load_from_file(data, volume, weight_filepath); - } else { - // default op - assert(weight_idx == 0 || weight_idx == 1); - // handle exception - if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + // default op + assert(weight_idx == 0 || weight_idx == 1); + // handle exception + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } - std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); - load_from_file(data, volume, weight_filepath); } // Copy the weight data from the buffer to the weight's ParallelTensor diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 40f758282c..1fa281777a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4065,6 +4065,7 @@ struct DefaultConfig { // const static int iterations = 1; const static int batchSize = 64; const static bool profiling = false; + const static bool benchmarking = false; const static bool inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; @@ -4100,6 +4101,7 @@ FFConfig::FFConfig() { // iterations = DefaultConfig::iterations; batchSize = DefaultConfig::batchSize; profiling = DefaultConfig::profiling; + benchmarking = DefaultConfig::benchmarking; inference_debugging = DefaultConfig::inference_debugging; learningRate = DefaultConfig::learningRate; weightDecay = DefaultConfig::weightDecay; @@ -4137,7 +4139,7 @@ FFConfig::FFConfig() { export_strategy_computation_graph_file = ""; dataset_path = ""; substitution_json_path = tl::nullopt; - syntheticInput = false; + benchmarking = false; perform_fusion = false; base_optimize_threshold = DefaultConfig::base_optimize_threshold; perform_memory_search = false; @@ -4290,6 +4292,10 @@ void FFConfig::parse_args(char **argv, int argc) { profiling = true; continue; } + if (!strcmp(argv[i], "--benchmarking")) { + benchmarking = true; + continue; + } if (!strcmp(argv[i], "--inference-debugging")) { inference_debugging = true; continue; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index ebaadade32..41703cf431 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -18,6 +18,7 @@ "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -25,7 +26,7 @@ # required parameters "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": "", + 
"cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": True, "prompt": "", From 7da197e71e31a1840d9404a63d5a9fdd20d4d41e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 23 Apr 2024 20:26:33 -0400 Subject: [PATCH 05/44] update workflow to build rocm docker images --- .github/workflows/docker-build.yml | 58 +++++++++++++----------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 54805cc325..d16179434b 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -20,26 +20,22 @@ concurrency: cancel-in-progress: true jobs: - oracle-runner-start: - name: Start an Oracle instance to build the ROCM Docker images + rocm-builder-start: + name: Start an AWS instance to build the ROCM Docker images runs-on: ubuntu-latest if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: - OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} + ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 - - name: Install Oracle Cloud Infrastructure library - run: pip install oci - - - name: Start Oracle Machine - run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID + - name: Start EC2 instance + run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID docker-build-rocm: name: Build and Install FlexFlow in a Docker Container (ROCm backend) @@ -66,8 +62,8 @@ jobs: docker-build-and-publish-rocm: name: Build and Deploy FlexFlow Docker Containers (ROCm backend) - needs: oracle-runner-start - runs-on: [self-hosted, cpu_only] + needs: rocm-builder-start + runs-on: [self-hosted, rocm_builder] if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: @@ -148,27 +144,23 @@ jobs: ./docker/publish.sh flexflow-environment ./docker/publish.sh flexflow - oracle-runner-stop: + rocm-builder-stop: needs: docker-build-and-publish-rocm if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest - name: Stop the Oracle instance we used to build the ROCM Docker images + name: Stop the AWS instance we used to build the ROCM Docker images env: - OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} + ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - - - name: Install Oracle Cloud 
Infrastructure library - run: pip install oci - - - name: Stop Oracle Machine - run: python3 .github/workflows/helpers/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Start EC2 instance + run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID notify-slack: name: Notify Slack in case of failure From 002fdf017c7dd665b703da37494093161c3d55c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 23 Apr 2024 22:35:42 -0400 Subject: [PATCH 06/44] downgrade to python 3.11 for now --- docker/flexflow-environment/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index edbf9a7e52..6ca337f58d 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -17,7 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut # Install Python3 with Miniconda ARG python_version "latest" -RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ +#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ +RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \ echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \ exit 1; \ From d54e4b6a747f3940a19989a56095a71540e4c0d8 Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com> Date: Wed, 1 May 2024 01:51:57 +0800 Subject: [PATCH 07/44] doc: fix c++ serving example (#1372) Co-authored-by: Gabriele Oliaro --- .github/README.md | 2 +- SERVE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/README.md b/.github/README.md index 4a2a881c8d..c4f6baada6 100644 --- a/.github/README.md +++ b/.github/README.md @@ -178,7 +178,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` diff --git a/SERVE.md b/SERVE.md index e9bab3d702..9472d50a62 100644 --- a/SERVE.md +++ b/SERVE.md @@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` From b90771a376fddbddf09af3f23e4ecae57911438e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 30 May 2024 14:24:42 -0700 Subject: [PATCH 08/44] Update README.md --- .github/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/README.md b/.github/README.md index c4f6baada6..5aba2295d5 100644 --- a/.github/README.md +++ b/.github/README.md @@ -4,12 +4,6 @@ --- -## News🔥: - -* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6 -* [08/16/2023] Adding Starcoder model support -* [08/14/2023] Released Docker images for different CUDA versions - ## What is FlexFlow Serve The high computational and memory requirements of generative large language From 385c118447a8b1451de3641c8ecf437245b9248b Mon Sep 17 00:00:00 2001 From: FelixBrakel Date: Thu, 30 May 2024 23:39:10 +0200 Subject: [PATCH 09/44] Add examples for every layer in the python layer API (#1297) * Fix incorrect innode being checked * Add example for every layer on the FFModel python class --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Zhihao Jia --- docs/source/python/layers.rst | 2 +- examples/python/native/ops/add.py | 45 ++++++++ .../ops/add_bias_residual_layer_norm.py | 78 +++++++++++++ examples/python/native/ops/arg_top_k.py | 61 ++++++++++ examples/python/native/ops/argmax.py | 55 +++++++++ examples/python/native/ops/batch_matmul.py | 0 examples/python/native/ops/batch_norm.py | 36 ++++++ examples/python/native/ops/beam_top_k.py | 58 ++++++++++ examples/python/native/ops/concat.py | 43 +++++++ examples/python/native/ops/conv2d.py | 45 ++++++++ examples/python/native/ops/cos.py | 44 +++++++ examples/python/native/ops/dense.py | 38 +++++++ examples/python/native/ops/divide.py | 48 ++++++++ examples/python/native/ops/dropout.py | 49 ++++++++ examples/python/native/ops/elu.py | 47 ++++++++ examples/python/native/ops/embedding.py | 39 +++++++ examples/python/native/ops/exp.py | 0 examples/python/native/ops/flat.py | 0 examples/python/native/ops/gather.py | 60 ++++++++++ examples/python/native/ops/gelu.py | 51 +++++++++ examples/python/native/ops/identity.py | 49 ++++++++ .../ops/inc_multihead_self_attention.py | 103 +++++++++++++++++ .../inc_multihead_self_attention_verify.py | 103 +++++++++++++++++ .../ops/inc_multiquery_self_attention.py | 107 ++++++++++++++++++ .../inc_multiquery_self_attention_verify.py | 107 ++++++++++++++++++ examples/python/native/ops/layer_norm.py | 48 ++++++++ examples/python/native/ops/max.py | 54 +++++++++ examples/python/native/ops/mean.py | 48 ++++++++ examples/python/native/ops/min.py | 54 +++++++++ .../python/native/ops/multihead_attention.py | 0 examples/python/native/ops/multiply.py | 45 ++++++++ examples/python/native/ops/pool2d.py | 36 ++++++ examples/python/native/ops/pow.py | 46 ++++++++ examples/python/native/ops/reduce_sum.py | 48 ++++++++ examples/python/native/ops/relu.py | 46 ++++++++ examples/python/native/ops/reshape.py | 41 +++++++ .../python/native/ops/residual_layer_norm.py | 93 +++++++++++++++ .../python/native/ops/residual_rms_norm.py | 80 +++++++++++++ examples/python/native/ops/reverse.py | 37 
++++++ examples/python/native/ops/rms_norm.py | 64 +++++++++++ examples/python/native/ops/rsqrt.py | 44 +++++++ examples/python/native/ops/sampling.py | 55 +++++++++ examples/python/native/ops/scalar_add.py | 53 +++++++++ examples/python/native/ops/scalar_multiply.py | 53 +++++++++ examples/python/native/ops/scalar_sub.py | 53 +++++++++ .../python/native/ops/scalar_true_divide.py | 53 +++++++++ examples/python/native/ops/sigmoid.py | 46 ++++++++ .../python/native/ops/sigmoid_silu_multi.py | 58 ++++++++++ examples/python/native/ops/sin.py | 44 +++++++ examples/python/native/ops/softmax.py | 46 ++++++++ .../ops/spec_inc_multihead_self_attention.py | 103 +++++++++++++++++ .../ops/spec_inc_multiquery_self_attention.py | 107 ++++++++++++++++++ examples/python/native/ops/split.py | 47 ++++++++ examples/python/native/ops/subtract.py | 45 ++++++++ examples/python/native/ops/tanh.py | 46 ++++++++ examples/python/native/ops/transpose.py | 38 +++++++ 56 files changed, 2898 insertions(+), 1 deletion(-) create mode 100644 examples/python/native/ops/add.py create mode 100644 examples/python/native/ops/add_bias_residual_layer_norm.py create mode 100644 examples/python/native/ops/arg_top_k.py create mode 100644 examples/python/native/ops/argmax.py create mode 100644 examples/python/native/ops/batch_matmul.py create mode 100644 examples/python/native/ops/batch_norm.py create mode 100644 examples/python/native/ops/beam_top_k.py create mode 100644 examples/python/native/ops/concat.py create mode 100644 examples/python/native/ops/conv2d.py create mode 100644 examples/python/native/ops/cos.py create mode 100644 examples/python/native/ops/dense.py create mode 100644 examples/python/native/ops/divide.py create mode 100644 examples/python/native/ops/dropout.py create mode 100644 examples/python/native/ops/elu.py create mode 100644 examples/python/native/ops/embedding.py create mode 100644 examples/python/native/ops/exp.py create mode 100644 examples/python/native/ops/flat.py create mode 100644 examples/python/native/ops/gather.py create mode 100644 examples/python/native/ops/gelu.py create mode 100644 examples/python/native/ops/identity.py create mode 100644 examples/python/native/ops/inc_multihead_self_attention.py create mode 100644 examples/python/native/ops/inc_multihead_self_attention_verify.py create mode 100644 examples/python/native/ops/inc_multiquery_self_attention.py create mode 100644 examples/python/native/ops/inc_multiquery_self_attention_verify.py create mode 100644 examples/python/native/ops/layer_norm.py create mode 100644 examples/python/native/ops/max.py create mode 100644 examples/python/native/ops/mean.py create mode 100644 examples/python/native/ops/min.py create mode 100644 examples/python/native/ops/multihead_attention.py create mode 100644 examples/python/native/ops/multiply.py create mode 100644 examples/python/native/ops/pool2d.py create mode 100644 examples/python/native/ops/pow.py create mode 100644 examples/python/native/ops/reduce_sum.py create mode 100644 examples/python/native/ops/relu.py create mode 100644 examples/python/native/ops/reshape.py create mode 100644 examples/python/native/ops/residual_layer_norm.py create mode 100644 examples/python/native/ops/residual_rms_norm.py create mode 100644 examples/python/native/ops/reverse.py create mode 100644 examples/python/native/ops/rms_norm.py create mode 100644 examples/python/native/ops/rsqrt.py create mode 100644 examples/python/native/ops/sampling.py create mode 100644 examples/python/native/ops/scalar_add.py create mode 100644 
examples/python/native/ops/scalar_multiply.py create mode 100644 examples/python/native/ops/scalar_sub.py create mode 100644 examples/python/native/ops/scalar_true_divide.py create mode 100644 examples/python/native/ops/sigmoid.py create mode 100644 examples/python/native/ops/sigmoid_silu_multi.py create mode 100644 examples/python/native/ops/sin.py create mode 100644 examples/python/native/ops/softmax.py create mode 100644 examples/python/native/ops/spec_inc_multihead_self_attention.py create mode 100644 examples/python/native/ops/spec_inc_multiquery_self_attention.py create mode 100644 examples/python/native/ops/split.py create mode 100644 examples/python/native/ops/subtract.py create mode 100644 examples/python/native/ops/tanh.py create mode 100644 examples/python/native/ops/transpose.py diff --git a/docs/source/python/layers.rst b/docs/source/python/layers.rst index 91f12094e6..1be91a8b17 100644 --- a/docs/source/python/layers.rst +++ b/docs/source/python/layers.rst @@ -3,7 +3,7 @@ Layers API ********** Layers are the basic building blocks of neural networks in FlexFlow. The inputs of a layer consists of a tensor or a list of tensors and some state variables, -and the outputs of a layer is a tensor or a list of tensors. +and the outputs of a layer is a tensor or a list of tensors. See https://github.com/flexflow/FlexFlow/examples/python/native/ops for an example for every layer .. automodule:: flexflow.core.flexflow_cffi :noindex: diff --git a/examples/python/native/ops/add.py b/examples/python/native/ops/add.py new file mode 100644 index 0000000000..50b9d16fd0 --- /dev/null +++ b/examples/python/native/ops/add.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'add' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_add(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.add(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_add(ffconfig, input1, input2) diff --git a/examples/python/native/ops/add_bias_residual_layer_norm.py b/examples/python/native/ops/add_bias_residual_layer_norm.py new file mode 100644 index 0000000000..6e8dffbc9e --- /dev/null +++ b/examples/python/native/ops/add_bias_residual_layer_norm.py @@ -0,0 +1,78 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_add_bias_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + residual_tensor = ffmodel.create_tensor(residual_arr.shape, DataType.DT_FLOAT) + + output_tensor, layer_norm_output = ffmodel.add_bias_residual_layer_norm( + input_tensor, + residual_tensor, + axes=axes, + elementwise_affine=elementwise_affine, + eps=eps, + use_bias=use_bias, + name="add_bias_residual_layer_norm_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_residual = ffmodel.create_data_loader(residual_tensor, residual_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_residual.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_residual.next_batch(ffmodel) + + ffmodel.forward() + + output_tensor.inline_map(ffmodel, ffconfig) + layer_norm_output.inline_map(ffmodel, ffconfig) + output_result = output_tensor.get_array(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return output_result, layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + axes_to_normalize = [1, 2] # Example 
axes to normalize + + output_result, layer_norm_result = test_add_bias_residual_layer_norm( + ffconfig, + input_data, + residual_data, + axes=axes_to_normalize, + elementwise_affine=True, + eps=1e-5, + use_bias=True + ) + + print("Input Array:") + print(input_data) + print("\nResidual Array:") + print(residual_data) + print(f"\nOutput Array after applying add_bias_residual_layer_norm along axes {axes_to_normalize}:") + print(output_result) + print("\nLayer Norm Result:") + print(layer_norm_result) diff --git a/examples/python/native/ops/arg_top_k.py b/examples/python/native/ops/arg_top_k.py new file mode 100644 index 0000000000..79edc5dfad --- /dev/null +++ b/examples/python/native/ops/arg_top_k.py @@ -0,0 +1,61 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_arg_top_k(ffconfig, input_arr: np.ndarray, k: int, sorted: bool, speculative_decoding: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + arg_top_k_output = ffmodel.arg_top_k( + input_tensor, + k, + sorted, + speculative_decoding, + name="arg_top_k_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_MEAN_SQUARED_ERROR, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + arg_top_k_output.inline_map(ffmodel, ffconfig) + output_result = arg_top_k_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + k_value = 5 + sorted_value = True + speculative_decoding_value = False # Example value for speculative_decoding + + output_result = test_arg_top_k( + ffconfig, + input_data, + k=k_value, + sorted=sorted_value, + speculative_decoding=speculative_decoding_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying arg_top_k:") + print(output_result) diff --git a/examples/python/native/ops/argmax.py b/examples/python/native/ops/argmax.py new file mode 100644 index 0000000000..dda0e6b0bc --- /dev/null +++ b/examples/python/native/ops/argmax.py @@ -0,0 +1,55 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_argmax(ffconfig, input_arr: np.ndarray, beam_search: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + argmax_output = ffmodel.argmax( + input_tensor, + beam_search, + name="argmax_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + argmax_output.inline_map(ffmodel, ffconfig) + output_result = argmax_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + 
beam_search_value = True # Set to True or False based on your requirement + + output_result = test_argmax( + ffconfig, + input_data, + beam_search=beam_search_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying argmax:") + print(output_result) diff --git a/examples/python/native/ops/batch_matmul.py b/examples/python/native/ops/batch_matmul.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/batch_norm.py b/examples/python/native/ops/batch_norm.py new file mode 100644 index 0000000000..b243e79d37 --- /dev/null +++ b/examples/python/native/ops/batch_norm.py @@ -0,0 +1,36 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def ff(ffconfig, input_arr: np.ndarray): + ffmodel = FFModel(ffconfig) + # TODO: convert input to ff tensor + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.batch_norm( + input_tensor + ) + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = ff(ffconfig, input) diff --git a/examples/python/native/ops/beam_top_k.py b/examples/python/native/ops/beam_top_k.py new file mode 100644 index 0000000000..cb2fdfb3d2 --- /dev/null +++ b/examples/python/native/ops/beam_top_k.py @@ -0,0 +1,58 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_beam_top_k(ffconfig, input_arr: np.ndarray, max_beam_size: int, sorted: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + beam_top_k_output = ffmodel.beam_top_k( + input_tensor, + max_beam_size, + sorted, + name="beam_top_k_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + beam_top_k_output.inline_map(ffmodel, ffconfig) + output_result = beam_top_k_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + max_beam_size_value = 3 + sorted_value = True + + output_result = test_beam_top_k( + ffconfig, + input_data, + max_beam_size=max_beam_size_value, + sorted=sorted_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying beam_top_k:") + print(output_result) diff --git a/examples/python/native/ops/concat.py b/examples/python/native/ops/concat.py new file mode 100644 index 0000000000..0088d7b848 --- /dev/null +++ b/examples/python/native/ops/concat.py @@ -0,0 
+1,43 @@ +# The basis for this test of the 'concatenate' operation is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_concatenate(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.concat([input_tensor1, input_tensor2], axis=1) + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_concatenate(ffconfig, input1, input2) diff --git a/examples/python/native/ops/conv2d.py b/examples/python/native/ops/conv2d.py new file mode 100644 index 0000000000..02b3646aaa --- /dev/null +++ b/examples/python/native/ops/conv2d.py @@ -0,0 +1,45 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def ff(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.conv2d( + input_tensor, + 32, + 3, + 3, + 1, + 1, + 1, + 1, + use_bias=False + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = ff(ffconfig, input) diff --git a/examples/python/native/ops/cos.py b/examples/python/native/ops/cos.py new file mode 100644 index 0000000000..26f6307685 --- /dev/null +++ b/examples/python/native/ops/cos.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_cos(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + cos_output = ffmodel.cos(input_tensor, name="cos_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + cos_output.inline_map(ffmodel, ffconfig) + cos_result = cos_output.get_array(ffmodel, ffconfig) + + return cos_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + cos_result = test_cos(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying cos function:") + print(cos_result) diff --git a/examples/python/native/ops/dense.py b/examples/python/native/ops/dense.py new file mode 100644 index 0000000000..ec0a3dc65b --- /dev/null +++ b/examples/python/native/ops/dense.py @@ -0,0 +1,38 @@ +# The basis for this test of the 'dense' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_dense(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.dense(input_tensor, 64, activation=ActiMode.AC_MODE_RELU) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + _ = test_dense(ffconfig, input) diff --git a/examples/python/native/ops/divide.py b/examples/python/native/ops/divide.py new file mode 100644 index 0000000000..419bf714ab --- /dev/null +++ b/examples/python/native/ops/divide.py @@ -0,0 +1,48 @@ +# The basis for this test of the 'divide' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_divide(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.divide(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + # Avoid division by zero in input2 + input2 = np.where(input2 == 0, 1e-6, input2) + + _ = test_divide(ffconfig, input1, input2) diff --git a/examples/python/native/ops/dropout.py b/examples/python/native/ops/dropout.py new file mode 100644 index 0000000000..3aa44a5a5b --- /dev/null +++ b/examples/python/native/ops/dropout.py @@ -0,0 +1,49 @@ +# The basis for this test of the 'Dropout' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_dropout(ffconfig, input_arr: np.ndarray, dropout_rate: float = 0.5) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Dropout layer + out = ffmodel.dropout(input_tensor, dropout_rate, 0) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + # You can adjust the dropout rate as needed + dropout_rate_param = 0.5 + + result = test_dropout(ffconfig, input_data, dropout_rate_param) + + print("Input Data:") + print(input_data) + + print("\nResult after Dropout layer:") + print(result) diff --git a/examples/python/native/ops/elu.py b/examples/python/native/ops/elu.py new file mode 100644 index 0000000000..7a6ef1f621 --- /dev/null +++ b/examples/python/native/ops/elu.py @@ -0,0 +1,47 @@ +# The basis for this test of the 'ELU' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_elu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply ELU activation + out = ffmodel.elu(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_elu(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after ELU activation:") + print(result) diff --git a/examples/python/native/ops/embedding.py b/examples/python/native/ops/embedding.py new file mode 100644 index 0000000000..34bced3798 --- /dev/null +++ b/examples/python/native/ops/embedding.py @@ -0,0 +1,39 @@ +# The basis for this test of the 'embedding' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_embedding(ffconfig, input_arr: np.ndarray, vocab_size: int, embedding_dim: int) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_INT32) + + out = ffmodel.embedding(input_tensor, vocab_size, embedding_dim, AggrMode.AGGR_MODE_SUM) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + vocab_size = 1000 + embedding_dim = 50 + input = np.random.randint(low=0, high=vocab_size, size=(ffconfig.batch_size, 10), dtype=np.int32) + _ = test_embedding(ffconfig, input, vocab_size, embedding_dim) diff --git a/examples/python/native/ops/exp.py b/examples/python/native/ops/exp.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/flat.py b/examples/python/native/ops/flat.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/gather.py b/examples/python/native/ops/gather.py new file mode 100644 index 0000000000..e13b6e4c75 --- /dev/null +++ b/examples/python/native/ops/gather.py @@ -0,0 +1,60 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_gather(ffconfig, input_arr: np.ndarray, index_arr: np.ndarray, dim: int, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + index_tensor = ffmodel.create_tensor(index_arr.shape, DataType.DT_INT32) + + 
gather_output = ffmodel.gather( + input_tensor, + index_tensor, + dim, + name="gather_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_index = ffmodel.create_data_loader(index_tensor, index_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_index.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_index.next_batch(ffmodel) + + ffmodel.forward() + + gather_output.inline_map(ffmodel, ffconfig) + output_result = gather_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + index_data = np.random.randint(0, 5, size=(ffconfig.batch_size,)).astype(np.int32) + dim_to_gather = 2 # Example dimension to gather along + + output_result = test_gather(ffconfig, input_data, index_data, dim=dim_to_gather) + + print("Input Array:") + print(input_data) + print("\nIndex Array:") + print(index_data) + print(f"\nOutput Array after applying gather along dimension {dim_to_gather}:") + print(output_result) diff --git a/examples/python/native/ops/gelu.py b/examples/python/native/ops/gelu.py new file mode 100644 index 0000000000..84fabd36e1 --- /dev/null +++ b/examples/python/native/ops/gelu.py @@ -0,0 +1,51 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_gelu(ffconfig, input_arr: np.ndarray, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + gelu_output = ffmodel.gelu( + input_tensor, + inplace=inplace, + name="gelu_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + gelu_output.inline_map(ffmodel, ffconfig) + output_result = gelu_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + inplace_flag = True # Example inplace flag + + output_result = test_gelu(ffconfig, input_data, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying gelu activation function (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/identity.py b/examples/python/native/ops/identity.py new file mode 100644 index 0000000000..fbf63e717c --- /dev/null +++ b/examples/python/native/ops/identity.py @@ -0,0 +1,49 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_identity(ffconfig, input_arr: np.ndarray, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + identity_output = ffmodel.identity( + input_tensor, + name="identity_layer" + ) + + 
ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + identity_output.inline_map(ffmodel, ffconfig) + output_result = identity_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + output_result = test_identity(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying identity function:") + print(output_result) diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py new file mode 100644 index 0000000000..dce7bd565d --- /dev/null +++ b/examples/python/native/ops/inc_multihead_self_attention.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multihead_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multihead_self_attention_output = ffmodel.inc_multihead_self_attention( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multihead_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = inc_multihead_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result = test_inc_multihead_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + 
add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multihead_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py new file mode 100644 index 0000000000..f6dc8e3933 --- /dev/null +++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multihead_self_attention_verify( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multihead_self_attention_verify_output = ffmodel.inc_multihead_self_attention_verify( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multihead_self_attention_verify_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multihead_self_attention_verify_output.inline_map(ffmodel, ffconfig) + output_result = inc_multihead_self_attention_verify_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result = test_inc_multihead_self_attention_verify( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multihead_self_attention_verify:") + print(output_result) diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py 
b/examples/python/native/ops/inc_multiquery_self_attention.py new file mode 100644 index 0000000000..33390ab1f6 --- /dev/null +++ b/examples/python/native/ops/inc_multiquery_self_attention.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multiquery_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multiquery_self_attention_output = ffmodel.inc_multiquery_self_attention( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multiquery_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_inc_multiquery_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multiquery_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py new file mode 100644 index 0000000000..69a76f68bf --- /dev/null +++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multiquery_self_attention_verify( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + 
num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multiquery_self_attention_verify_output = ffmodel.inc_multiquery_self_attention_verify( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multiquery_self_attention_verify_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multiquery_self_attention_verify_output.inline_map(ffmodel, ffconfig) + output_result = inc_multiquery_self_attention_verify_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_inc_multiquery_self_attention_verify( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multiquery_self_attention_verify:") + print(output_result) diff --git a/examples/python/native/ops/layer_norm.py b/examples/python/native/ops/layer_norm.py new file mode 100644 index 0000000000..b3cca93d6e --- /dev/null +++ b/examples/python/native/ops/layer_norm.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_layer_norm(ffconfig, input_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + layer_norm_output = ffmodel.layer_norm(input_tensor, axes=axes, elementwise_affine=elementwise_affine, eps=eps, use_bias=use_bias, name="layer_norm_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + 
loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + layer_norm_output.inline_map(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + axes_to_normalize = [1, 2] # Example axes to normalize + + layer_norm_result = test_layer_norm(ffconfig, input_data, axes=axes_to_normalize, elementwise_affine=True, eps=1e-5, use_bias=True) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying layer_norm function along axes {axes_to_normalize}:") + print(layer_norm_result) diff --git a/examples/python/native/ops/max.py b/examples/python/native/ops/max.py new file mode 100644 index 0000000000..bf9c629406 --- /dev/null +++ b/examples/python/native/ops/max.py @@ -0,0 +1,54 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_max(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + max_output = ffmodel.max(input_tensor1, input_tensor2, name="max_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + max_output.inline_map(ffmodel, ffconfig) + max_result = max_output.get_array(ffmodel, ffconfig) + + return max_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + max_result = test_max(ffconfig, input_data1, input_data2) + + print("Input Array 1:") + print(input_data1) + print("\nInput Array 2:") + print(input_data2) + print("\nOutput Array after applying max function:") + print(max_result) diff --git a/examples/python/native/ops/mean.py b/examples/python/native/ops/mean.py new file mode 100644 index 0000000000..df8c3f642e --- /dev/null +++ b/examples/python/native/ops/mean.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_mean(ffconfig, input_arr: np.ndarray, dims: List[int], keepdims: bool = False) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + mean_output = ffmodel.mean(input_tensor, dims=dims, keepdims=keepdims, name="mean_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + 
ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + mean_output.inline_map(ffmodel, ffconfig) + mean_result = mean_output.get_array(ffmodel, ffconfig) + + return mean_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + dims_to_mean = [1, 2] # Example dimensions to take the mean over + + mean_result = test_mean(ffconfig, input_data, dims=dims_to_mean, keepdims=False) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying mean function along dimensions {dims_to_mean}:") + print(mean_result) diff --git a/examples/python/native/ops/min.py b/examples/python/native/ops/min.py new file mode 100644 index 0000000000..df81f4f2d2 --- /dev/null +++ b/examples/python/native/ops/min.py @@ -0,0 +1,54 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_min(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + min_output = ffmodel.min(input_tensor1, input_tensor2, name="min_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + min_output.inline_map(ffmodel, ffconfig) + min_result = min_output.get_array(ffmodel, ffconfig) + + return min_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + min_result = test_min(ffconfig, input_data1, input_data2) + + print("Input Array 1:") + print(input_data1) + print("\nInput Array 2:") + print(input_data2) + print("\nOutput Array after applying min function:") + print(min_result) diff --git a/examples/python/native/ops/multihead_attention.py b/examples/python/native/ops/multihead_attention.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/multiply.py b/examples/python/native/ops/multiply.py new file mode 100644 index 0000000000..fb4f489150 --- /dev/null +++ b/examples/python/native/ops/multiply.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'multiply' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_multiply(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.multiply(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_multiply(ffconfig, input1, input2) diff --git a/examples/python/native/ops/pool2d.py b/examples/python/native/ops/pool2d.py new file mode 100644 index 0000000000..b4dc8b219e --- /dev/null +++ b/examples/python/native/ops/pool2d.py @@ -0,0 +1,36 @@ +# AI generated from conv2d example +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_pool2d(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.pool2d(input_tensor, 3, 3, 1, 1, 0, 0, PoolType.POOL_MAX) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_pool2d(ffconfig, input) \ No newline at end of file diff --git a/examples/python/native/ops/pow.py b/examples/python/native/ops/pow.py new file mode 100644 index 0000000000..cf5bbebd80 --- /dev/null +++ b/examples/python/native/ops/pow.py @@ -0,0 +1,46 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_pow(ffconfig, input_arr: np.ndarray, exponent: float) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + pow_output = ffmodel.pow(input_tensor, exponent, name="pow_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = 
ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + pow_output.inline_map(ffmodel, ffconfig) + pow_result = pow_output.get_array(ffmodel, ffconfig) + + return pow_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + exponent_value = 2.0 # Example exponent value + + pow_result = test_pow(ffconfig, input_data, exponent=exponent_value) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying pow function with exponent {exponent_value}:") + print(pow_result) diff --git a/examples/python/native/ops/reduce_sum.py b/examples/python/native/ops/reduce_sum.py new file mode 100644 index 0000000000..7e7b41b799 --- /dev/null +++ b/examples/python/native/ops/reduce_sum.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_reduce_sum(ffconfig, input_arr: np.ndarray, axes: List[int], keepdims: bool = False) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + reduce_sum_output = ffmodel.reduce_sum(input_tensor, axes=axes, keepdims=keepdims, name="reduce_sum_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + reduce_sum_output.inline_map(ffmodel, ffconfig) + reduce_sum_result = reduce_sum_output.get_array(ffmodel, ffconfig) + + return reduce_sum_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + axes_to_reduce = [1, 2] # Example axes to reduce + + reduce_sum_result = test_reduce_sum(ffconfig, input_data, axes=axes_to_reduce, keepdims=False) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying reduce_sum along axes {axes_to_reduce}:") + print(reduce_sum_result) diff --git a/examples/python/native/ops/relu.py b/examples/python/native/ops/relu.py new file mode 100644 index 0000000000..d855b27164 --- /dev/null +++ b/examples/python/native/ops/relu.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'ReLU' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_relu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply ReLU activation + out = ffmodel.relu(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_relu(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after ReLU activation:") + print(result) diff --git a/examples/python/native/ops/reshape.py b/examples/python/native/ops/reshape.py new file mode 100644 index 0000000000..348d6bd935 --- /dev/null +++ b/examples/python/native/ops/reshape.py @@ -0,0 +1,41 @@ +# The basis for this test of the 'reshape' operation is generated by ChatGPT using the manually created conv2d.py as a template. + +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_reshape(ffconfig, input_arr: np.ndarray, target_shape: List[int]) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.reshape(input_tensor, target_shape) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + target_shape = [ffconfig.batch_size, 500] + + _ = test_reshape(ffconfig, input, target_shape) diff --git a/examples/python/native/ops/residual_layer_norm.py b/examples/python/native/ops/residual_layer_norm.py new file mode 100644 index 0000000000..e12f2e53d9 --- /dev/null +++ b/examples/python/native/ops/residual_layer_norm.py @@ -0,0 +1,93 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual1_arr: np.ndarray, residual2_arr: np.ndarray, use_two_residuals: bool, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + residual1_tensor = ffmodel.create_tensor(residual1_arr.shape, DataType.DT_FLOAT) + residual2_tensor = ffmodel.create_tensor(residual2_arr.shape, DataType.DT_FLOAT) + + output_tensor, 
layer_norm_output = ffmodel.residual_layer_norm( + input_tensor, + residual1_tensor, + residual2_tensor if use_two_residuals else None, + use_two_residuals, + axes=axes, + elementwise_affine=elementwise_affine, + eps=eps, + use_bias=use_bias, + name="residual_layer_norm_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_residual1 = ffmodel.create_data_loader(residual1_tensor, residual1_arr) + dataloader_residual2 = ffmodel.create_data_loader(residual2_tensor, residual2_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_residual1.reset() + if use_two_residuals: + dataloader_residual2.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_residual1.next_batch(ffmodel) + if use_two_residuals: + dataloader_residual2.next_batch(ffmodel) + + ffmodel.forward() + + output_tensor.inline_map(ffmodel, ffconfig) + layer_norm_output.inline_map(ffmodel, ffconfig) + output_result = output_tensor.get_array(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return output_result, layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + use_two_residuals_flag = True # Example flag + + axes_to_normalize = [1, 2] # Example axes to normalize + + output_result, layer_norm_result = test_residual_layer_norm( + ffconfig, + input_data, + residual1_data, + residual2_data, + use_two_residuals_flag, + axes=axes_to_normalize, + elementwise_affine=True, + eps=1e-5, + use_bias=True + ) + + print("Input Array:") + print(input_data) + print("\nResidual1 Array:") + print(residual1_data) + if use_two_residuals_flag: + print("\nResidual2 Array:") + print(residual2_data) + print(f"\nOutput Array after applying residual_layer_norm along axes {axes_to_normalize} with use_two_residuals={use_two_residuals_flag}:") + print(output_result) + print("\nLayer Norm Result:") + print(layer_norm_result) diff --git a/examples/python/native/ops/residual_rms_norm.py b/examples/python/native/ops/residual_rms_norm.py new file mode 100644 index 0000000000..9027dffada --- /dev/null +++ b/examples/python/native/ops/residual_rms_norm.py @@ -0,0 +1,80 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_residual_rms_norm( + ffconfig, + input1_arr: np.ndarray, + input2_arr: np.ndarray, + eps: float, + dim: int, + name=None, +): + ffmodel = FFModel(ffconfig) + + input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT) + input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT) + + residual_rms_norm_output1, residual_rms_norm_output2 = ffmodel.residual_rms_norm( + input1_tensor, + input2_tensor, + eps, + dim, + name="residual_rms_norm_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = 
ffmodel.create_data_loader(input1_tensor, input1_arr) + dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + residual_rms_norm_output1.inline_map(ffmodel, ffconfig) + output_result1 = residual_rms_norm_output1.get_array(ffmodel, ffconfig) + + residual_rms_norm_output2.inline_map(ffmodel, ffconfig) + output_result2 = residual_rms_norm_output2.get_array(ffmodel, ffconfig) + + return output_result1, output_result2 + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + input2_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + eps_value = 1e-6 + dim_value = 1 # Example value for dim + + output_result1, output_result2 = test_residual_rms_norm( + ffconfig, + input1_data, + input2_data, + eps=eps_value, + dim=dim_value, + ) + + print("Input Array 1:") + print(input1_data) + print("\nInput Array 2:") + print(input2_data) + print("\nOutput Array 1 after applying residual_rms_norm:") + print(output_result1) + print("\nOutput Array 2 after applying residual_rms_norm:") + print(output_result2) diff --git a/examples/python/native/ops/reverse.py b/examples/python/native/ops/reverse.py new file mode 100644 index 0000000000..25394d4b9a --- /dev/null +++ b/examples/python/native/ops/reverse.py @@ -0,0 +1,37 @@ +# The basis for this test of the 'reverse' operation is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_reverse(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.reverse(input_tensor, axis=2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_reverse(ffconfig, input) diff --git a/examples/python/native/ops/rms_norm.py b/examples/python/native/ops/rms_norm.py new file mode 100644 index 0000000000..3983d7f891 --- /dev/null +++ b/examples/python/native/ops/rms_norm.py @@ -0,0 +1,64 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_rms_norm( + ffconfig, + input_arr: np.ndarray, + eps: float, + dim: int, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + rms_norm_output = ffmodel.rms_norm( + input_tensor, + eps, + dim, + name="rms_norm_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + rms_norm_output.inline_map(ffmodel, ffconfig) + output_result = rms_norm_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + eps_value = 1e-6 + dim_value = 1 # Example value for dim + + output_result = test_rms_norm( + ffconfig, + input_data, + eps=eps_value, + dim=dim_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying rms_norm:") + print(output_result) diff --git a/examples/python/native/ops/rsqrt.py b/examples/python/native/ops/rsqrt.py new file mode 100644 index 0000000000..3d9ab65449 --- /dev/null +++ b/examples/python/native/ops/rsqrt.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_rsqrt(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + rsqrt_output = ffmodel.rsqrt(input_tensor, name="rsqrt_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + rsqrt_output.inline_map(ffmodel, ffconfig) + rsqrt_result = rsqrt_output.get_array(ffmodel, ffconfig) + + return rsqrt_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + rsqrt_result = test_rsqrt(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying rsqrt function:") + print(rsqrt_result) diff --git a/examples/python/native/ops/sampling.py b/examples/python/native/ops/sampling.py new file mode 100644 index 0000000000..2219f09eff --- /dev/null +++ b/examples/python/native/ops/sampling.py @@ -0,0 +1,55 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sampling(ffconfig, input_arr: np.ndarray, top_p: float, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + sampling_output = ffmodel.sampling( + input_tensor, + top_p, + name="sampling_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_MEAN_SQUARED_ERROR, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + sampling_output.inline_map(ffmodel, ffconfig) + output_result = sampling_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + top_p_value = 0.8 + + 
output_result = test_sampling( + ffconfig, + input_data, + top_p=top_p_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying sampling:") + print(output_result) diff --git a/examples/python/native/ops/scalar_add.py b/examples/python/native/ops/scalar_add.py new file mode 100644 index 0000000000..48a316ea8a --- /dev/null +++ b/examples/python/native/ops/scalar_add.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_add(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_add_output = ffmodel.scalar_add( + input_tensor, + scalar, + inplace=inplace, + name="scalar_add_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_add_output.inline_map(ffmodel, ffconfig) + output_result = scalar_add_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_add(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar addition with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_multiply.py b/examples/python/native/ops/scalar_multiply.py new file mode 100644 index 0000000000..ebae5cce01 --- /dev/null +++ b/examples/python/native/ops/scalar_multiply.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_multiply(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_multiply_output = ffmodel.scalar_multiply( + input_tensor, + scalar, + inplace=inplace, + name="scalar_multiply_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_multiply_output.inline_map(ffmodel, ffconfig) + output_result = scalar_multiply_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_multiply(ffconfig, input_data, 
scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar multiplication with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_sub.py b/examples/python/native/ops/scalar_sub.py new file mode 100644 index 0000000000..2dc467b573 --- /dev/null +++ b/examples/python/native/ops/scalar_sub.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_sub(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_sub_output = ffmodel.scalar_sub( + input_tensor, + scalar, + inplace=inplace, + name="scalar_sub_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_sub_output.inline_map(ffmodel, ffconfig) + output_result = scalar_sub_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_sub(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar subtraction with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_true_divide.py b/examples/python/native/ops/scalar_true_divide.py new file mode 100644 index 0000000000..f1b64df506 --- /dev/null +++ b/examples/python/native/ops/scalar_true_divide.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_true_divide(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_true_divide_output = ffmodel.scalar_true_divide( + input_tensor, + scalar, + inplace=inplace, + name="scalar_true_divide_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_true_divide_output.inline_map(ffmodel, ffconfig) + output_result = scalar_true_divide_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + 
output_result = test_scalar_true_divide(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar true division with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/sigmoid.py b/examples/python/native/ops/sigmoid.py new file mode 100644 index 0000000000..0fbe21df45 --- /dev/null +++ b/examples/python/native/ops/sigmoid.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'Sigmoid' activation function is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_sigmoid(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Sigmoid activation + out = ffmodel.sigmoid(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_sigmoid(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after Sigmoid activation:") + print(result) diff --git a/examples/python/native/ops/sigmoid_silu_multi.py b/examples/python/native/ops/sigmoid_silu_multi.py new file mode 100644 index 0000000000..cecc3e102e --- /dev/null +++ b/examples/python/native/ops/sigmoid_silu_multi.py @@ -0,0 +1,58 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sigmoid_silu_multi(ffconfig, input1_arr: np.ndarray, input2_arr: np.ndarray, name=None): + ffmodel = FFModel(ffconfig) + + input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT) + input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT) + + sigmoid_silu_multi_output = ffmodel.sigmoid_silu_multi( + input1_tensor, + input2_tensor, + name="sigmoid_silu_multi_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr) + dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + sigmoid_silu_multi_output.inline_map(ffmodel, ffconfig) + output_result = sigmoid_silu_multi_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2_data = np.random.randn(ffconfig.batch_size, 5, 10, 
10).astype(np.float32) + + output_result = test_sigmoid_silu_multi(ffconfig, input1_data, input2_data) + + print("Input1 Array:") + print(input1_data) + print("\nInput2 Array:") + print(input2_data) + print("\nOutput Array after applying sigmoid_silu_multi:") + print(output_result) diff --git a/examples/python/native/ops/sin.py b/examples/python/native/ops/sin.py new file mode 100644 index 0000000000..4b60a4e1d4 --- /dev/null +++ b/examples/python/native/ops/sin.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sin(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + sin_output = ffmodel.sin(input_tensor, name="sin_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + sin_output.inline_map(ffmodel, ffconfig) + sin_result = sin_output.get_array(ffmodel, ffconfig) + + return sin_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + sin_result = test_sin(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying sin function:") + print(sin_result) diff --git a/examples/python/native/ops/softmax.py b/examples/python/native/ops/softmax.py new file mode 100644 index 0000000000..b5481bcc80 --- /dev/null +++ b/examples/python/native/ops/softmax.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'Softmax' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_softmax(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Softmax activation + out = ffmodel.softmax(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10).astype(np.float32) + + result = test_softmax(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after Softmax activation:") + print(result) diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py new file mode 100644 index 0000000000..bd1aaa189b --- /dev/null +++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_spec_inc_multihead_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + spec_inc_multihead_self_attention_output = ffmodel.spec_inc_multihead_self_attention( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="spec_inc_multihead_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + spec_inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = spec_inc_multihead_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result 
= test_spec_inc_multihead_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying spec_inc_multihead_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py new file mode 100644 index 0000000000..0b731c99e0 --- /dev/null +++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_spec_inc_multiquery_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + spec_inc_multiquery_self_attention_output = ffmodel.spec_inc_multiquery_self_attention( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="spec_inc_multiquery_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + spec_inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = spec_inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_spec_inc_multiquery_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for 
kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying spec_inc_multiquery_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/split.py b/examples/python/native/ops/split.py new file mode 100644 index 0000000000..d03a52a769 --- /dev/null +++ b/examples/python/native/ops/split.py @@ -0,0 +1,47 @@ +# The basis for this test of the 'split' operation is generated by ChatGPT using the manually created conv2d.py as a template. + +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_split(ffconfig, input_arr: np.ndarray) -> List[flexflow.core.Tensor]: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out1, out2 = ffmodel.split(input_tensor, 2, axis=1) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out1.inline_map(ffmodel, ffconfig) + out2.inline_map(ffmodel, ffconfig) + + return [out1.get_array(ffmodel, ffconfig), out2.get_array(ffmodel, ffconfig)] + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 10, 10, 10).astype(np.float32) + output_list = test_split(ffconfig, input) + + print("Output Tensor 1:") + print(output_list[0]) + + print("\nOutput Tensor 2:") + print(output_list[1]) diff --git a/examples/python/native/ops/subtract.py b/examples/python/native/ops/subtract.py new file mode 100644 index 0000000000..5f829cbae1 --- /dev/null +++ b/examples/python/native/ops/subtract.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'subtract' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_subtract(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.subtract(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_subtract(ffconfig, input1, input2) diff --git a/examples/python/native/ops/tanh.py b/examples/python/native/ops/tanh.py new file mode 100644 index 0000000000..ba4ba7d6ff --- /dev/null +++ b/examples/python/native/ops/tanh.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'tanh' activation function is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_tanh(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply tanh activation + out = ffmodel.tanh(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_tanh(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after tanh activation:") + print(result) diff --git a/examples/python/native/ops/transpose.py b/examples/python/native/ops/transpose.py new file mode 100644 index 0000000000..6f514d660c --- /dev/null +++ b/examples/python/native/ops/transpose.py @@ -0,0 +1,38 @@ +# The basis for this test of the 'transpose' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_transpose(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.transpose(input_tensor, [ffconfig.batch_size, 10, 5, 10]) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_transpose(ffconfig, input) From a83effedd6e0185a7e8225f445c0aaba840c1aca Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 20 Jun 2024 04:08:29 +0000 Subject: [PATCH 10/44] add code to keep runners registered --- .github/workflows/docker-build.yml | 41 ++++++++++++++++++++---------- .github/workflows/gpu-ci.yml | 24 +++++++++++++++++ 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index d16179434b..eeaab0e0af 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -9,9 +9,9 @@ on: branches: - "inference" - "master" - # schedule: - # # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated - # - cron: "0 8 * * 0" + schedule: + # At 00:00 on day-of-month 1, 14, and 28. 
+ - cron: "0 0 1,14,28 * *" workflow_dispatch: # Cancel outdated workflows if they are still running @@ -58,13 +58,28 @@ jobs: - name: Check availability of flexflow modules in Python run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - + + keep-runner-registered: + name: Keep runner alive + if: ${{ github.event_name == 'schedule' }} + runs-on: [self-hosted, rocm_builder] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: rocm-builder-start + steps: + - name: Keep alive + run: | + echo "Keep self-hosted runner registered with Github" + sleep 10m docker-build-and-publish-rocm: name: Build and Deploy FlexFlow Docker Containers (ROCm backend) needs: rocm-builder-start runs-on: [self-hosted, rocm_builder] - if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: hip_version: ["5.3", "5.4", "5.5", "5.6"] @@ -106,19 +121,19 @@ jobs: cuda_version: ${{ matrix.cuda_version }} steps: - name: Checkout Git Repository - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} uses: actions/checkout@v3 with: submodules: recursive - name: Free additional space on runner - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: .github/workflows/helpers/free_space_on_runner.sh - name: Build Docker container - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} build_needed: ${{ matrix.cuda_version == '12.0' }} run: | # On push to inference, build for all compatible architectures, so that we can publish @@ -133,11 +148,11 @@ jobs: fi - name: Check availability of flexflow modules in Python - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: docker run --entrypoint /bin/bash 
flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - name: Publish Docker environment image (on push to inference) - if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} run: | @@ -145,7 +160,7 @@ jobs: ./docker/publish.sh flexflow rocm-builder-stop: - needs: docker-build-and-publish-rocm + needs: [docker-build-and-publish-rocm, keep-runner-registered] if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest name: Stop the AWS instance we used to build the ROCM Docker images @@ -166,7 +181,7 @@ jobs: name: Notify Slack in case of failure runs-on: ubuntu-20.04 needs: [docker-build-cuda, docker-build-and-publish-rocm] - if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} + if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }} steps: - name: Send Slack message env: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7bdb6805a8..c7d0cd72cb 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -1,5 +1,7 @@ name: "gpu-ci" on: + schedule: + - cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28. 
push: branches: - "inference" @@ -43,8 +45,28 @@ jobs: pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py + keep-runner-registered: + name: Keep runner alive + if: ${{ github.event_name == 'schedule' }} + runs-on: [self-hosted, gpu] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: gpu-ci-concierge + container: + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + options: --gpus all --shm-size=8192m + steps: + - name: Keep alive + run: | + echo "Keep self-hosted runner registered with Github" + sleep 10m + python-interface-check: name: Check Python Interface + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] defaults: run: @@ -119,6 +141,7 @@ jobs: inference-tests: name: Inference Tests + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] defaults: run: @@ -195,6 +218,7 @@ jobs: training-tests: name: Training Tests + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} From 4f82aaed6317cef0a2587848a3b6d57f1d709381 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jul 2024 23:15:28 -0400 Subject: [PATCH 11/44] fix docker --- docker/flexflow-environment/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 6ca337f58d..cef619ad68 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -37,6 +37,7 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \ rm ~/${MINICONDA_SCRIPT_NAME} && \ + /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya From 25fb40772f587892510bfe0ca296ae54768ff35c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 11 Jul 2024 15:16:40 -0400 Subject: [PATCH 12/44] [Tokenizer] update tokenizers-cpp repo --- deps/tokenizers-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp index 4f42c9fa74..c0fab1e14a 160000 --- a/deps/tokenizers-cpp +++ b/deps/tokenizers-cpp @@ -1 +1 @@ -Subproject commit 4f42c9fa74946d70af86671a3804b6f2433e5dac +Subproject commit c0fab1e14a9421c1501acee5b7703e5dafa60479 From 6a1a1886909fc864aadfb10823077f94fe03b72e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 3 Aug 2024 08:31:37 -0700 Subject: [PATCH 13/44] minor bug fix (#1456) --- .../ops/kernels/inc_multihead_self_attention_kernels.h | 3 ++- src/ops/attention.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e2..26dcf12425 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -56,7 +56,8 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int num_heads, int num_kv_heads, bool scaling_query, - float scaling_factor); + float scaling_factor, + int hidden_size); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) template diff --git a/src/ops/attention.cu 
b/src/ops/attention.cu index 9b8b90da70..18fc810aed 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -206,7 +206,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); - cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; + unsigned attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; // Assume no beam search for now int maxBeamSize = 1; // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) From 9784b5c6516bafe272fc6555daaa9b867a5eacfa Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 12 Aug 2024 11:02:49 -0700 Subject: [PATCH 14/44] update legion version (#1307) * update legion version * legion version update * update legion version --- CMakeLists.txt | 2 +- deps/legion | 2 +- examples/cpp/AlexNet/alexnet.cc | 2 +- examples/cpp/DLRM/dlrm.cc | 2 +- examples/cpp/InceptionV3/inception.cc | 2 +- examples/cpp/ResNet/resnet.cc | 2 +- examples/cpp/Transformer/transformer.cc | 2 +- examples/cpp/XDL/xdl.cc | 2 +- examples/cpp/candle_uno/candle_uno.cc | 2 +- examples/cpp/mixture_of_experts/moe.cc | 2 +- examples/cpp/resnext50/resnext.cc | 2 +- examples/cpp/split_test/split_test.cc | 2 +- examples/cpp/split_test_2/split_test_2.cc | 2 +- include/flexflow/graph.h | 2 +- include/flexflow/operator.h | 4 +++- include/flexflow/utils/recursive_logger.h | 4 ++-- inference/incr_decoding/incr_decoding.cc | 2 +- inference/spec_infer/spec_infer.cc | 2 +- src/mapper/mapper.cc | 7 ++++++- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/runtime/batch_config.cc | 2 +- src/runtime/beam_search_batch_config.cc | 2 +- src/runtime/graph.cc | 4 ++-- src/runtime/inference_manager.cc | 4 ++-- src/runtime/model.cc | 6 ++++-- src/runtime/optimizer_kernel.cpp | 4 ++-- src/runtime/optimizer_kernel.cu | 2 +- src/runtime/request_manager.cc | 2 +- src/runtime/simulator.cc | 8 ++++---- src/runtime/substitution.cc | 4 ++-- src/runtime/tree_verify_batch_config.cc | 2 +- tests/ops/batch_matmul_test.cc | 2 +- tests/ops/concat_test.cc | 2 +- tests/ops/flat_test.cc | 2 +- tests/ops/linear_test.cc | 2 +- tests/ops/reshape_test.cc | 2 +- tests/ops/tanh_test.cc | 2 +- tests/ops/transpose_test.cc | 2 +- 41 files changed, 59 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..7079fdadb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,7 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. 
When building from pip, we install the FF_HOME/python/flexflow_python script instead. diff --git a/deps/legion b/deps/legion index 24e8c45234..02eb1010ca 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0 diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc index 128496eab1..3507882329 100644 --- a/examples/cpp/AlexNet/alexnet.cc +++ b/examples/cpp/AlexNet/alexnet.cc @@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("AlexNet"); +Legion::Logger log_app("AlexNet"); void parse_input_args(char **argv, int argc, AlexNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc index 7dc49215b3..d7dc167557 100644 --- a/examples/cpp/DLRM/dlrm.cc +++ b/examples/cpp/DLRM/dlrm.cc @@ -19,7 +19,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("DLRM"); +Legion::Logger log_app("DLRM"); void parse_input_args(char **argv, int argc, DLRMConfig &apConfig); diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc index b2070cc52d..6d0fa7ee53 100644 --- a/examples/cpp/InceptionV3/inception.cc +++ b/examples/cpp/InceptionV3/inception.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("Inceptionv3"); +Legion::Logger log_app("Inceptionv3"); Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) { Tensor t1 = input; diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc index 455eb743ae..49ce934a6a 100644 --- a/examples/cpp/ResNet/resnet.cc +++ b/examples/cpp/ResNet/resnet.cc @@ -24,7 +24,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("ResNet"); +Legion::Logger log_app("ResNet"); void parse_input_args(char **argv, int argc, ResNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc index d61a63cd03..b04093b0a9 100644 --- a/examples/cpp/Transformer/transformer.cc +++ b/examples/cpp/Transformer/transformer.cc @@ -17,7 +17,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("Transformer"); +Legion::Logger log_app("Transformer"); Tensor create_emb(FFModel *model, Tensor const &input, diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc index 2e6c3cec98..a2272f36e5 100644 --- a/examples/cpp/XDL/xdl.cc +++ b/examples/cpp/XDL/xdl.cc @@ -18,7 +18,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("XDL"); +Legion::Logger log_app("XDL"); void parse_input_args(char **argv, int argc, XDLConfig &apConfig); diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc index 779b8e9c14..e9f4bf876a 100644 --- a/examples/cpp/candle_uno/candle_uno.cc +++ b/examples/cpp/candle_uno/candle_uno.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace std; -LegionRuntime::Logger::Category log_app("Candle_Uno"); +Legion::Logger log_app("Candle_Uno"); void parse_input_args(char **argv, int argc, CandleConfig &apConfig); diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc index a707310885..a25f94abd9 100644 --- a/examples/cpp/mixture_of_experts/moe.cc +++ b/examples/cpp/mixture_of_experts/moe.cc @@ -20,7 +20,7 
@@ using namespace Legion; -LegionRuntime::Logger::Category log_app("MoE"); +Legion::Logger log_app("MoE"); void parse_input_args(char **argv, int argc, MoeConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc index 3c28ca27b8..9b71b37cce 100644 --- a/examples/cpp/resnext50/resnext.cc +++ b/examples/cpp/resnext50/resnext.cc @@ -7,7 +7,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("resnext"); +Legion::Logger log_app("resnext"); Tensor resnext_block(FFModel &ff, Tensor input, diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc index 97b98c3214..ac9d516a59 100644 --- a/examples/cpp/split_test/split_test.cc +++ b/examples/cpp/split_test/split_test.cc @@ -3,7 +3,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("split_test"); +Legion::Logger log_app("split_test"); void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc index 69385d14cb..fef078adbc 100644 --- a/examples/cpp/split_test_2/split_test_2.cc +++ b/examples/cpp/split_test_2/split_test_2.cc @@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph; using FlexFlow::PCG::GraphSearchHelper; using FlexFlow::PCG::Node; -LegionRuntime::Logger::Category log_app("split_test_2"); +Legion::Logger log_app("split_test_2"); void top_level_task(Task const *task, std::vector const ®ions, diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4b..9dc6572593 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -24,7 +24,7 @@ #include "legion/legion_utilities.h" #include -extern LegionRuntime::Logger::Category log_dp; +extern Legion::Logger log_dp; namespace FlexFlow::PCG { diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82f..311699d926 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -19,7 +19,7 @@ namespace FlexFlow { -extern LegionRuntime::Logger::Category log_measure; +extern Legion::Logger log_measure; class OpMeta; class Simulator; @@ -233,6 +233,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; }; virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/utils/recursive_logger.h b/include/flexflow/utils/recursive_logger.h index 2c43b42309..d073f58f3e 100644 --- a/include/flexflow/utils/recursive_logger.h +++ b/include/flexflow/utils/recursive_logger.h @@ -26,7 +26,7 @@ class DepthTag { class RecursiveLogger { public: - /* RecursiveLogger(LegionRuntime::Logger::Category const &); */ + /* RecursiveLogger(Legion::Logger const &); */ RecursiveLogger(std::string const &category_name); Realm::LoggerMessage info(); @@ -42,7 +42,7 @@ class RecursiveLogger { void print_prefix(Realm::LoggerMessage &) const; - LegionRuntime::Logger::Category logger; + Legion::Logger logger; }; }; // namespace FlexFlow diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index aae7256ffe..ec3dda3158 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -28,7 +28,7 @@ using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger 
log_app("llama"); struct FilePaths { std::string cache_folder_path; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index f7edfd7696..60233ac8d1 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -26,7 +26,7 @@ using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index c293aecb19..4413d516ac 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -20,7 +20,7 @@ namespace FlexFlow { using namespace Legion; using namespace Mapping; -LegionRuntime::Logger::Category log_ff_mapper("Mapper"); +Legion::Logger log_ff_mapper("Mapper"); FFShardingFunctor::FFShardingFunctor(int _gpus_per_node, int _cpus_per_node, @@ -296,6 +296,7 @@ void FFMapper::select_task_options(const MapperContext ctx, // control replicate top level task if (enable_control_replication) { output.replicate = true; + output.map_locally = false; } return; } @@ -560,6 +561,10 @@ void FFMapper::map_task(const MapperContext ctx, assert(output.target_procs[i].address_space() == node_id); } } + if (input.shard_processor.exists()) { + output.target_procs = std::vector{input.shard_processor}; + } + // Find instances that still need to be mapped std::vector> missing_fields(task.regions.size()); runtime->filter_instances(ctx, diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 18534455a0..8545bea7cb 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -25,7 +25,7 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); +Legion::Logger log_beam_topk("BeamTopK"); template struct Entry { diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index a958786be3..c24bdf7c74 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -25,7 +25,7 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); +Legion::Logger log_beam_topk("BeamTopK"); template struct Entry { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7aa3503770..8688585788 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA"); +Legion::Logger log_inc_mha("IncrementalMHA"); bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d0efb01d54..9b8c88420d 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_tree_verify("TreeVerifyIncMHA"); +Legion::Logger log_tree_verify("TreeVerifyIncMHA"); bool TreeIncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index bd96dbb141..7989b0799e 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc 
@@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_bc("BatchConfig"); +Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index ff7bf1a819..0509c23afe 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -24,7 +24,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig"); +Legion::Logger log_beam_bc("BeamSearchBatchConfig"); BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { this->beam_width = DEFAULT_BEAM_WIDTH; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f8e8240ccf..cf75235ae7 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -66,8 +66,8 @@ namespace FlexFlow::PCG { using namespace Legion; using FlexFlow::MachineView; -LegionRuntime::Logger::Category log_graph("graph"); -LegionRuntime::Logger::Category log_simplify("graph_simplify"); +Legion::Logger log_graph("graph"); +Legion::Logger log_simplify("graph_simplify"); const Node Node::INVALID_NODE = Node(); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4d..3d299aeedd 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -25,8 +25,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); -LegionRuntime::Logger::Category log_offload("Offloading"); +Legion::Logger log_inf_mgr("InferenceManager"); +Legion::Logger log_offload("Offloading"); InferenceManager::InferenceManager() {} diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1fa281777a..5cad628743 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -82,8 +82,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_model("Model"); -LegionRuntime::Logger::Category log_measure("measure"); +Legion::Logger log_model("Model"); +Legion::Logger log_measure("measure"); Op::Op(FFModel &model, OperatorType otype, @@ -6748,6 +6748,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(SGD_UPD_NCCL_TASK_ID, "SGD NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task"); @@ -6898,6 +6899,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Init Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task"); diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a8..59efaf5256 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -247,4 +247,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 5f654fbb5b..df37e3b135 100644 --- a/src/runtime/optimizer_kernel.cu +++ 
b/src/runtime/optimizer_kernel.cu @@ -20,7 +20,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 16513e918a..d21285eef2 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -29,7 +29,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; -LegionRuntime::Logger::Category log_req_mgr("RequestManager"); +Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index d943376416..b71af0d47e 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -31,10 +31,10 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_sim("sim"); -LegionRuntime::Logger::Category log_ps_sim("ps_sim"); -LegionRuntime::Logger::Category log_xfer_sim("xfer_sim"); -LegionRuntime::Logger::Category log_xfer_est("xfer_est"); +Legion::Logger log_sim("sim"); +Legion::Logger log_ps_sim("ps_sim"); +Legion::Logger log_xfer_sim("xfer_sim"); +Legion::Logger log_xfer_est("xfer_est"); // template class std::map; // for debugging in gdb // template class std::map; // for debugging in gdb diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c0804d6e19..b86964049d 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -54,8 +54,8 @@ namespace FlexFlow::PCG { using namespace Legion; -LegionRuntime::Logger::Category log_xfers("xfers"); -LegionRuntime::Logger::Category log_xfer_matches("xfer_matches"); +Legion::Logger log_xfers("xfers"); +Legion::Logger log_xfer_matches("xfer_matches"); const TensorX TensorX::NO_TX = TensorX(); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 841c735f59..49d42bb6dd 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig"); +Legion::Logger log_tree_bc("TreeVerifyBatchConfig"); TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc index 7931f44129..f61048febf 100644 --- a/tests/ops/batch_matmul_test.cc +++ b/tests/ops/batch_matmul_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("bmm_test"); +Legion::Logger log_app("bmm_test"); struct BMMTestMeta { int m, k, n, d; diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc index c67b718e0e..b0489d1adb 100644 --- a/tests/ops/concat_test.cc +++ b/tests/ops/concat_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("concat_test"); +Legion::Logger log_app("concat_test"); struct ConcatTestMeta { int batch_size, i_dim, num_channels, projected_num_channels, diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc index 428893a0dc..61de83b6b0 100644 --- a/tests/ops/flat_test.cc +++ b/tests/ops/flat_test.cc @@ -7,7 +7,7 @@ #include using namespace Legion; -LegionRuntime::Logger::Category log_app("Flat_test"); +Legion::Logger log_app("Flat_test"); struct FlatTestMeta { int i_dim, o_dim; diff --git a/tests/ops/linear_test.cc 
b/tests/ops/linear_test.cc index 5b65de3a56..7c84ad1078 100644 --- a/tests/ops/linear_test.cc +++ b/tests/ops/linear_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("linear_test"); +Legion::Logger log_app("linear_test"); struct LinearTestMeta { int batch_size, i_dim, num_channels, dense_projection_o_dim, diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc index e8f4586b23..a8aa046a64 100644 --- a/tests/ops/reshape_test.cc +++ b/tests/ops/reshape_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Reshape_test"); +Legion::Logger log_app("Reshape_test"); struct ReshapeTestMeta { int i_dim, o_dim; diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc index 1c24d96aaf..1e86934f86 100644 --- a/tests/ops/tanh_test.cc +++ b/tests/ops/tanh_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Tanh_test"); +Legion::Logger log_app("Tanh_test"); struct TanhTestMeta { int i_dim, o_dim; diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc index 10481aa14f..045f28479c 100644 --- a/tests/ops/transpose_test.cc +++ b/tests/ops/transpose_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("transpose_test"); +Legion::Logger log_app("transpose_test"); struct TransposeTestMeta { int m, k, d; From f747438f0927ec528d481cfd6b9c7f15465677c9 Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:49:54 -0400 Subject: [PATCH 15/44] Managed mem support (#1466) * feat: fix missed compile definition * feat: add func `get_proc_mem` to process memory allocation * chore: minor * chore: try to use get_proc_mem * fix: proc_mem allocation * feat: switch to use get_proc_mem * feat: update Realm::Logger definition * fix: now all memory are allocated by get_proc_mem * chore: minor * fix: no memory allocation bugs * chore: merge file * chore: don't use ManagedMemory for now --- CMakeLists.txt | 1 + include/flexflow/model.h | 1 + include/flexflow/ops/batch_norm.h | 1 + include/flexflow/utils/memory_allocator.h | 2 ++ src/mapper/mapper.cc | 7 ++----- src/ops/add_bias_residual_layer_norm.cc | 5 +---- src/ops/argmax.cc | 5 +---- src/ops/attention.cc | 5 +---- src/ops/batch_norm.cpp | 5 +---- src/ops/batch_norm.cu | 5 +---- src/ops/beam_topk.cc | 5 +---- src/ops/dropout.cc | 5 +---- src/ops/inc_multihead_self_attention.cc | 5 +---- src/ops/layer_norm.cc | 5 +---- src/ops/linear.cc | 5 +---- src/ops/residual_layer_norm.cc | 5 +---- src/ops/residual_rms_norm.cc | 5 +---- src/ops/rms_norm.cc | 5 +---- src/ops/sampling.cc | 5 +---- src/ops/sigmoid_silu_multi.cc | 5 +---- src/ops/spec_inc_multihead_self_attention.cc | 5 +---- src/ops/tree_inc_multihead_self_attention.cc | 5 +---- src/runtime/graph.cc | 5 +---- src/runtime/memory_allocator.cc | 12 ++++++++++++ src/runtime/model.cc | 4 ++-- src/runtime/model.cpp | 15 +++------------ src/runtime/model.cu | 15 +++------------ 27 files changed, 45 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7079fdadb8..d7a6391e06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,6 +496,7 @@ if(NOT BUILD_LEGION_ONLY) if(NOT CARGO_RESULT EQUAL 0) message(FATAL_ERROR "Rust is installed, but cargo is not. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") endif() + set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab581..ea64f65a95 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -22,6 +22,7 @@ #include "flexflow/node.h" #include "flexflow/operator_params.h" #include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/memory_allocator.h" #include "flexflow/utils/tuple.h" #include "initializer.h" #include "layer.h" diff --git a/include/flexflow/ops/batch_norm.h b/include/flexflow/ops/batch_norm.h index c923dc1097..01cc0e16ec 100644 --- a/include/flexflow/ops/batch_norm.h +++ b/include/flexflow/ops/batch_norm.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_BATCH_NORM_H #include "flexflow/model.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b3..7091b159b2 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -62,6 +62,8 @@ class MemoryAllocator { size_t instance_total_size, instance_allocated_size; }; +Legion::Memory get_proc_mem(Legion::Machine machine, Legion::Processor proc); + }; // namespace FlexFlow #endif // _FLEXFLOW_RUNTIME_H_ diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index 4413d516ac..d7b9a5e99d 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -14,6 +14,7 @@ */ #include "flexflow/mapper.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { @@ -81,11 +82,7 @@ FFMapper::FFMapper(MapperRuntime *rt, if (it->address_space() == node_id) { local_gpus.push_back(*it); } - Machine::MemoryQuery fb_query(machine); - fb_query.only_kind(Memory::GPU_FB_MEM); - fb_query.best_affinity_to(*it); - assert(fb_query.count() == 1); - proc_fbmems[*it] = *(fb_query.begin()); + proc_fbmems[*it] = get_proc_mem(machine, *it); Machine::MemoryQuery zc_query(machine); zc_query.only_kind(Memory::Z_COPY_MEM); zc_query.has_affinity_to(*it); diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index e670380901..a17e156f18 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -493,10 +493,7 @@ OpMeta *AddBiasResidualLayerNorm::init_task( Runtime *runtime) { AddBiasResidualLayerNorm *ln = (AddBiasResidualLayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); AddBiasResidualLayerNormMeta *meta = new AddBiasResidualLayerNormMeta(handle, ln, gpu_mem_allocator); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index a52ce1886b..1892ac2353 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -233,10 +233,7 @@ OpMeta *ArgMax::init_task(Task const *task, ctx, task->regions[1].region.get_index_space()); int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - Memory gpu_mem = 
Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ArgMaxMeta *m = new ArgMaxMeta(handle, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 97afc94341..203662d3ec 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -514,10 +514,7 @@ OpMeta * acc_output.rect.hi[1] - acc_output.rect.lo[1] + 1); assert(attn->oProjSize == acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MultiHeadAttentionMeta *m = new MultiHeadAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 106e5ebad2..7dee6fdaaf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -61,10 +61,7 @@ __host__ OpMeta * int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1; int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); BatchNormMeta *m = new BatchNormMeta( handle, bm, gpu_mem, output_n, output_c, output_h, output_w); return m; diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index b77e9d489f..929ebf81f8 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -58,10 +58,7 @@ __host__ OpMeta * int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1; int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); BatchNormMeta *m = new BatchNormMeta( handle, bm, gpu_mem, output_n, output_c, output_h, output_w); return m; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index d2054cacb0..5f4547ace5 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -271,10 +271,7 @@ OpMeta *BeamTopK::init_task(Task const *task, Runtime *runtime) { BeamTopK *topk = (BeamTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 58cb82d53d..190d6fd496 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -164,10 +164,7 @@ OpMeta *Dropout::init_task(Task const *task, ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); 
assert(input_domain == output_domain); DropoutMeta *m = new DropoutMeta(handle, dropout, gpu_mem, output_domain); std::strcpy(m->op_name, dropout->name); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8688585788..aa60d0f19c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -698,10 +698,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { // cpu-offload enabled diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 2218ffe392..b19f400eb2 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -380,10 +380,7 @@ OpMeta *LayerNorm::init_task(Task const *task, Runtime *runtime) { LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); LayerNormMeta *meta = new LayerNormMeta(handle, ln, gpu_mem_allocator); std::strcpy(meta->op_name, ln->name); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0c7a0f78fe..44b56d623e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -480,10 +480,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, // in_dim, // out_dim, // batch_size); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (linear->offload) { // cpu-offload enabled diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ed9252c309..8dd670eea3 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -489,10 +489,7 @@ OpMeta *ResidualLayerNorm::init_task(Task const *task, Runtime *runtime) { ResidualLayerNorm *ln = (ResidualLayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualLayerNormMeta *meta = new ResidualLayerNormMeta(handle, ln, gpu_mem_allocator); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index f4f5bb72d0..b3ee7179d0 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -347,10 +347,7 @@ OpMeta *ResidualRMSNorm::init_task(Task const *task, Runtime *runtime) { ResidualRMSNorm *rn = (ResidualRMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualRMSNormMeta *meta = new 
ResidualRMSNormMeta(handle, rn, gpu_mem_allocator); diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index bf07ee6bb0..79dce65c57 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -294,10 +294,7 @@ OpMeta *RMSNorm::init_task(Task const *task, Runtime *runtime) { RMSNorm *rn = (RMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); RMSNormMeta *meta = new RMSNormMeta(handle, rn, gpu_mem_allocator); std::strcpy(meta->op_name, rn->name); diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 9fc2316f9a..b38c68843b 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -226,10 +226,7 @@ OpMeta *Sampling::init_task(Task const *task, int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); SamplingMeta *m = new SamplingMeta( handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3ddd6b8d6e..3d1c8d9094 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -237,10 +237,7 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task, Runtime *runtime) { SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); SigmoidSiluMultiMeta *meta = new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 9c6ed0e0b6..68d3a4c205 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -640,10 +640,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_kv_heads = attn->num_kv_heads; assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) SpecIncMultiHeadSelfAttentionMeta *m = diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 9b8c88420d..df722a3d51 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -697,10 +697,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); 
+ Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { // cpu-offload enabled diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index cf75235ae7..b023aced6e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1914,10 +1914,7 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MachineModel *machine; if (model->config.machine_model_version == 0) { machine = diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc index 06a7c468a4..cb4e867165 100644 --- a/src/runtime/memory_allocator.cc +++ b/src/runtime/memory_allocator.cc @@ -19,7 +19,9 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; +using Legion::Machine; using Legion::Memory; +using Legion::Processor; using Realm::RegionInstance; MemoryAllocator::MemoryAllocator(Memory _memory) @@ -51,4 +53,14 @@ void MemoryAllocator::register_reserved_work_space(void *base, size_t size) { reserved_allocated_size = 0; } +// Now it's for allocating FB memory, in the future we can +// add more types of memory allocation if needed +Memory get_proc_mem(Machine machine, Processor proc) { + Machine::MemoryQuery proc_mem = Machine::MemoryQuery(machine) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(proc); + assert(proc_mem.count() > 0); + return proc_mem.first(); +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5cad628743..f1e222e6e3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4273,8 +4273,8 @@ void FFConfig::parse_args(char **argv, int argc) { workersPerNode = atoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-ll:fsize")) { - device_mem = atoi(argv[++i]); + if ((!strcmp(argv[i], "-ll:fsize")) || (!strcmp(argv[i], "-ll:msize"))) { + device_mem += atoi(argv[++i]); continue; } if (!strcmp(argv[i], "--nodes")) { diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b781567..62f6b89b7f 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -112,10 +112,7 @@ FFHandler // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0); { // allocate memory for workspace - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.workSpaceSize - 1)); @@ -133,10 +130,7 @@ FFHandler } if (handle.offload_reserve_space_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); @@ -157,10 +151,7 @@ FFHandler } if (handle.batch_config_metadata_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory 
gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 23b7f0efbe..fd39ed0db0 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -108,10 +108,7 @@ FFHandler // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0); { // allocate memory for workspace - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.workSpaceSize - 1)); @@ -129,10 +126,7 @@ FFHandler } if (handle.offload_reserve_space_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); @@ -153,10 +147,7 @@ FFHandler } if (handle.batch_config_metadata_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); From 6d710acd79f968f65397874f62b8ebef20590620 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 20 Aug 2024 14:06:52 -0600 Subject: [PATCH 16/44] pip flexflow_python typo (#1461) Co-authored-by: Zhihao Jia --- python/flexflow/flexflow_python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/flexflow_python b/python/flexflow/flexflow_python index cf247b9ede..8a9b65a404 100644 --- a/python/flexflow/flexflow_python +++ b/python/flexflow/flexflow_python @@ -6,7 +6,7 @@ python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.ge pylib_path="$(python "$python_packages"/flexflow/findpylib.py)" pylib_dir="$(dirname "$pylib_path")" export PATH="${python_packages}/flexflow/bin:${PATH}" -export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${PATH}" +export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${LD_LIBRARY_PATH}" legion_python_args=("$@" "-ll:py" "1") legion_python "${legion_python_args[@]}" From 3b59f0577cc6fc3a109921f72ceadef3458cf635 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 29 Aug 2024 00:04:28 +0200 Subject: [PATCH 17/44] update legion version --- deps/legion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/legion b/deps/legion index 02eb1010ca..0d32b35542 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0 +Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b From 28aff70cc98d065390eb58b7fd15dcd24f3fb786 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 31 Aug 2024 06:00:57 -0700 Subject: [PATCH 18/44] Fix nccl-induced segfault (#1481) --- include/flexflow/model.h | 1 + src/runtime/model.cc | 68 ++++++++++++++++++---------------- src/runtime/request_manager.cc | 3 
++ 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ea64f65a95..6dda67bbfe 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1079,6 +1079,7 @@ class FFModel { bool use_propagation) const; #ifdef FF_USE_NCCL ncclComm_t *find_nccl_comms(MachineView const &view) const; + void finish_nccl_comms(); #endif #ifdef FF_USE_PROPAGATE void propagate(std::map const ¤t, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f1e222e6e3..4c67de1aa9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1589,41 +1589,47 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +#ifdef FF_USE_NCCL +void FFModel::finish_nccl_comms() { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +} +#endif + FFModel::~FFModel() { // Destroy nccl communication groups #ifdef FF_USE_NCCL if (config.computationMode == COMP_MODE_TRAINING) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } + finish_nccl_comms(); } #endif } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d21285eef2..bada87ab19 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2365,6 +2365,9 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } +#ifdef FF_USE_NCCL + llm->finish_nccl_comms(); +#endif } /*static*/ From 49523d62691039a9a8c29891acc5d48641048cc4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 2 Sep 2024 03:05:25 -0700 Subject: [PATCH 19/44] Fix python install issue caused by new Legion version (#1482) * fix * . 
* . * fix * cleanup * fix * cleanup --- CMakeLists.txt | 20 ++++++++++++++------ cmake/pip_install/CMakeLists.txt | 20 ++++++++++++++++++-- pyproject.toml | 3 ++- requirements.txt | 1 + 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7a6391e06..c82a53644e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,13 +37,24 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +# option for using Python +option(FF_USE_PYTHON "Enable Python" ON) +if (FF_USE_PYTHON) + find_package(Python3 COMPONENTS Interpreter Development) +endif() + if(INSTALL_DIR) message(STATUS "INSTALL_DIR: ${INSTALL_DIR}") set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE) else() - # Install DIR not set. Use default, unless a conda environment is active - if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI) - set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + # Install DIR not set. Use default, unless a conda environment is in use + if ((DEFINED ENV{CONDA_PREFIX} OR (Python3_EXECUTABLE AND Python3_EXECUTABLE MATCHES "conda")) AND NOT FF_BUILD_FROM_PYPI) + if (DEFINED ENV{CONDA_PREFIX}) + set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + else() + get_filename_component(CONDA_PREFIX "${Python3_EXECUTABLE}" DIRECTORY) + get_filename_component(CONDA_PREFIX "${CONDA_PREFIX}" DIRECTORY) + endif() # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE) message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") @@ -64,9 +75,6 @@ option(FF_BUILD_FROM_PYPI "Build from pypi" OFF) # build shared or static flexflow lib option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON) -# option for using Python -option(FF_USE_PYTHON "Enable Python" ON) - # option for building legion only option(BUILD_LEGION_ONLY "Build Legion only" OFF) diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 105133a310..217d7e14f0 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -2,9 +2,25 @@ if (FF_USE_PYTHON) execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) - install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") + cmake_path(SET CMAKE_SOURCE_DIR_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion) + cmake_path(SET CMAKE_BUILD_DIR_ NORMALIZE ${Legion_BINARY_DIR}/runtime) + cmake_path(SET CMAKE_INSTALL_PREFIX_ NORMALIZE ${PY_DEST}/../../..) 
+ cmake_path(SET WORKING_DIRECTORY_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/) # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + # CMAKE_SOURCE_DIR_=/usr/FlexFlow/deps/legion + # CMAKE_BUILD_DIR_: /usr/FlexFlow/build//deps/legion/runtime + # CMAKE_INSTALL_PREFIX_: /opt/conda/ or /usr/local + # WORKING_DIRECTORY_: /usr/FlexFlow/deps/legion/bindings/python/ + # PY_DEST: /python3.11/site-packages + message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") + message(STATUS "Legion_BINARY_DIR: ${Legion_BINARY_DIR}") + message(STATUS "CMAKE_SOURCE_DIR_: ${CMAKE_SOURCE_DIR_}") + message(STATUS "CMAKE_BUILD_DIR_: ${CMAKE_BUILD_DIR_}") + message(STATUS "CMAKE_INSTALL_PREFIX_: ${CMAKE_INSTALL_PREFIX_}") + message(STATUS "WORKING_DIRECTORY_: ${WORKING_DIRECTORY_}") + message(STATUS "PY_DEST: ${PY_DEST}") + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${CMAKE_INSTALL_PREFIX_} \")") + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E env CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR_} CMAKE_BUILD_DIR=${CMAKE_BUILD_DIR_} CMAKE_INSTALL_PREFIX=${PY_DEST}/flexflow ${Python3_EXECUTABLE} setup.py install --prefix ${CMAKE_INSTALL_PREFIX_} ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${WORKING_DIRECTORY_} COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)") endif() endif() diff --git a/pyproject.toml b/pyproject.toml index 4b8214f3fe..373c53beb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ requires = [ "setuptools_scm[toml]>=6.0", "cmake-build-extension", "ninja", - "requests" + "requests", + "pip", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 1037661337..ad65622367 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ onnx transformers>=4.31.0 sentencepiece einops +pip From a0f1ed783e3ef48ac374563cf3f4fc2388f34b4c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 4 Sep 2024 14:15:06 -0400 Subject: [PATCH 20/44] PEFT support (inference/finetuning) (#1153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * . * . * Update the default cublas behavior when CUDA_VERSION is not specified * fix bugs in IncMHA peft_bwd kernel * uncomment softmaxbackward * add layernorm to align test * add peft test scripts * fix import * fix * add code to convert peft models * add script to download peft for c++, fix bug * fix * add script to fine-tune models * implement loading lora configs/weights from file * remove peft_bwd assertion failure in embedding * fix download script * add peft dependencies in dockerfile * fix softmax backward * fix bc print indentation * Temporarily Revert "Update the default cublas behavior when CUDA_VERSION is not specified" This reverts commit 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a. 
* Fix cublas default (#1220) * Fix Legion prebuild workflow (2) (#1208) * fix * fix * fix * fix * Fix Legion prebuild workflow (3) (#1210) * fix hip error * use CUBLAS_COMPUTE_FAST_16F for full-precision gemm --------- Co-authored-by: Zhihao Jia * fix bugs, work on align opt-lora * update scripts * add code to output peft tensors in hf * update, fixes * linting * fix printing of tensors for numpy * update save_inference_tensors_to_file * linting * update * fix issue with save_inference_tensors_to_file * fix layer names for save_inference_tensors_to_file * fix peft * fix bwd bugs * linting * fixes * fix * fix * fix * add bc fields for peft training * linting * fix * remove ptr check * fix * implement save_operators for bwd * fix bug * implement save tensors for bwd * . * bug fix * fix * align linear * fix * bwd kernel updates * undo use of CUBLAS_COMPUTE_32F_FAST_16F for now * only send dataset entry once * update peft test scripts * loss * . * update generate/request api to take both inference and fine-tuning prompts * linting * alignment fixes in lora & linear layer * alignment fix * diagonal * fix * alignment fix ssm * sigmoid-silu-multi now fully aligned * rms norm kernel updates * fix * in-place residual rms * bug fix and linting * align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj with huggingface * cleanup * finished all alignment fixes in attention backward kernel * fix * Update inc_multihead_self_attention.cu * Update inc_multihead_self_attention.cu * use grad to store peft in/output (#1241) * use grad to store peft in/output * format * . * format * enable peft request * several hacks for performance measurement; some of the changes should be reverted * Update sigmoid_silu_multi.cu * RoPE backward * PEFT bug fixes and alignment (#1269) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * linting * Fuse bias + relu in OPT (#1271) * fuse bias and relu in opt * fix * fix * fix * fix * Peft alignment & debugging tools (#1288) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * fix * fix * fix * update * simplify tensor names * fix * fixes and updates * fixes * fix * cleanup * . * restore softmax * cleanup * update alignment scripts * newline * fix legion aliasing error * fix warnings * fix * fix pipeline parallelism * fix tp issue in combine op * fix lora weight loading with tensor parallelism * fixes, implement Combine::peft_bwd_task * fix * replicate peft bwd * fixes * fix * fix combine and fwd-bwd pass dependencies * fix replicate bwd * fix * let user control amount of peft memory * only run peft_bwd if peft is enabled * fix rms norm inference region reqs * fix in-place fusion (part 1) * fix inplace fusion (part 2) * fix * disable automatic inplace rms norm for now * fix inf fusion inplace * fix rest input grads for peft without inplace residuals * fix * fix * fix residual rms * fix * fix * enable inf debugging in fusion bwd * hack to silence warning in fused bwd * fix * fix * fix build * fix * fix * add draft peft test * Peft python interface (#1306) * update script * less model renaming * fix * fix * fix * backup * . * update * . 
* fixes * fix * fix build * fix * fix * fix issues for downloading peft model * solved issues for download peft model * added printouts for debugging * fix * fix seg fault * add test, separate peft script in cpp * fix * fixes * fix * update peft python interface * update * update * update * updates * fix * fixes * fix * fixes --------- Co-authored-by: april-yyt * fix * update * fix * fix to support prompts larger than max tokens per batch * fixes to support benchmarking of finetuning throughput * many upgrades and updates related to finetuning * add ttft statistics * add warmup phase * add benchmarking code * Add scripts for evaluation with Microsoft Azure trace (#1363) * Add scripts for evaluation * Add absolute request rate value * Fix script for target arrival rate * Fix cpp req rate benchmark * update to use new dataset * Fix infinite loop * update * add data --------- Co-authored-by: Remi Delacourt Co-authored-by: Gabriele Oliaro * fix * fix * add peft tests to ci * shellcheck * fix * fix python requirements * fix * fix * update ci test * update alignment doc * fix cross entropy loss bug * update alignment test * update test * add llama peft alignment test to ci * Fix values for unused params in incr_decoding * Add PEFTModelID NO_ID singleton instead of None * Fix PEFTModelID::NO_ID reference * reduce logging * fix * fix * Add peft demo * Add readme for demo * fix alignment issue * Peft optimizer (#1290) * add optimizer config, only allocate weights for training * sgd 1 * sgd 2 * update * fix * linting * . * . * fix * fix allreduce bug * update * update * add optimizer hook in hf * update * update script * . * fix * fwd * bwd * start grads * fix gradient misalignment! * update * Add support for llama3 * various fixes --------- Co-authored-by: Remi Delacourt * Optimizers python interface (#1441) * python interface for optimizer * update lora linear config to support python interface * update python interface * finished lora python interface * fix * fix * update * update * more fixes * fix * initialize lora weights where needed * Add notebook * Update demo to use dataset * Fix' * Save weights after end of finetuning (#1446) * support accumulation of gradients without update * add code to save peft weights * fix * save configs * cleanup * Fully use notebook for demo * Parameterize generation and finetuning configs * Comment out inference for now * fix bug in lora inference only mode * fix * Add finetuning or inference only flags * fix * fix * fix * PEFT model upload (#1450) * upload test * fix * Make demo_class.py executable * fix * add base_model_name_or_path * fix * fix * support llama-3 tokenizer * print output tokens when not benchmarking * Use Llama3 in demo_class * Use Llama3 in demo * fix data loading for llama-3 * Add download models to demo * return/print loss at each finetuning step * fix * Adjust demo parameters * Fix for finetuning * pass finetuning losses to python interface * Update demo * Fix upload * Refactor demo * rename demo_class to demo * fix * remove epoch from loss print * Finish demo * fix test * rocm fixes * more rocm fixes * fix rocm build * docker fix * fix inference test * fix workflow * fix makefile * fix peft test * fix all-reduce issue with lora for TP scenario * fix bwd lm head * fixes * more fixes * update * fix alignment up to input ln * finished aligning all backward (tp>1) * align all peft * fix * fix broken link * formatting * fix * update * Revert "update" This reverts commit 90b2c876ca3ea9c29e59aa7ae9904f254298660d. 
* update * fix hip build * fix gpu ci * fix gpu ci * update default gpu ci version to 12.0 * update ci to 12.0 * fix * fix * update * fix * fix * update * fix * add cleanup * downgrade to cuda=11.8 --------- Co-authored-by: Gabriele Oliaro Co-authored-by: xinhaoc Co-authored-by: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Co-authored-by: april-yyt Co-authored-by: Remi <54138269+Flechman@users.noreply.github.com> Co-authored-by: Remi Delacourt Co-authored-by: Rémi Delacourt --- .github/workflows/build.yml | 12 +- .github/workflows/gpu-ci.yml | 10 + .github/workflows/helpers/install_cudnn.sh | 23 +- .github/workflows/helpers/install_nccl.sh | 8 +- .github/workflows/multinode-test.yml | 6 +- .github/workflows/pip-install.yml | 4 +- .github/workflows/prebuild-legion.yml | 4 +- .gitignore | 5 + CMakeLists.txt | 1 + conda/flexflow.yml | 7 + config/config.inc | 2 +- docker/build.sh | 9 +- docker/flexflow-environment/Dockerfile | 2 + docker/run.sh | 2 +- include/flexflow/batch_config.h | 42 +- include/flexflow/config.h | 41 +- include/flexflow/ffconst.h | 77 +- include/flexflow/fftype.h | 25 + include/flexflow/flexflow_c.h | 136 +- include/flexflow/inference.h | 1 + include/flexflow/layer.h | 2 +- include/flexflow/model.h | 61 +- include/flexflow/op_meta.h | 6 +- include/flexflow/operator.h | 95 +- include/flexflow/operator_params.h | 4 + .../ops/add_bias_residual_layer_norm.h | 63 +- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/aggregate.h | 4 +- include/flexflow/ops/aggregate_spec.h | 4 +- include/flexflow/ops/argmax.h | 11 +- include/flexflow/ops/cache.h | 4 +- include/flexflow/ops/element_unary.h | 4 +- include/flexflow/ops/embedding.h | 5 + include/flexflow/ops/experts.h | 17 +- include/flexflow/ops/fused.h | 9 + include/flexflow/ops/groupby.h | 4 +- .../ops/inc_multihead_self_attention.h | 23 +- .../ops/kernels/batch_matmul_kernels.h | 4 +- include/flexflow/ops/kernels/cast_kernels.h | 4 +- include/flexflow/ops/kernels/concat_kernels.h | 4 +- .../flexflow/ops/kernels/conv_2d_kernels.h | 4 +- include/flexflow/ops/kernels/flat_kernels.h | 4 +- .../inc_multihead_self_attention_utils.cuh | 27 +- include/flexflow/ops/kernels/linear_kernels.h | 31 + .../ops/kernels/lora_linear_kernels.h | 77 + .../flexflow/ops/kernels/pool_2d_kernels.h | 4 +- .../flexflow/ops/kernels/reshape_kernels.h | 6 +- .../ops/kernels/residual_rms_norm_kernels.h | 30 +- .../flexflow/ops/kernels/rms_norm_kernels.h | 23 +- .../flexflow/ops/kernels/softmax_kernels.h | 46 +- .../flexflow/ops/kernels/transpose_kernels.h | 4 +- include/flexflow/ops/layer_norm.h | 50 +- include/flexflow/ops/linear.h | 9 + include/flexflow/ops/lora_linear.h | 99 + include/flexflow/ops/lora_linear_params.h | 150 + include/flexflow/ops/residual_layer_norm.h | 41 +- .../flexflow/ops/residual_layer_norm_params.h | 1 + include/flexflow/ops/residual_rms_norm.h | 16 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm.h | 13 + include/flexflow/ops/sigmoid_silu_multi.h | 33 +- include/flexflow/ops/softmax.h | 9 + include/flexflow/ops/topk.h | 4 +- include/flexflow/ops/transpose.h | 2 + .../ops/tree_inc_multihead_self_attention.h | 2 +- include/flexflow/parallel_ops/allreduce.h | 19 +- include/flexflow/parallel_ops/combine.h | 13 + .../parallel_ops/kernels/allreduce_kernels.h | 14 +- .../parallel_ops/kernels/combine_kernels.h | 4 +- .../kernels/parallel_identity_kernels.h | 41 + .../parallel_ops/kernels/partition_kernels.h | 4 +- .../flexflow/parallel_ops/parallel_identity.h | 83 + 
.../parallel_ops/parallel_identity_params.h | 22 + include/flexflow/parallel_ops/parallel_op.h | 2 +- include/flexflow/parallel_ops/replicate.h | 9 + include/flexflow/request_manager.h | 44 +- include/flexflow/simulator.h | 56 +- include/flexflow/utils/cuda_helper.h | 13 +- include/flexflow/utils/hip_helper.h | 33 +- include/flexflow/utils/memory_allocator.h | 5 + .../flexflow/utils/peft_weight_allocator.h | 92 + inference/MODEL_WEIGHTS.md | 28 - inference/README.md | 42 + inference/incr_decoding/incr_decoding.cc | 11 +- inference/models/falcon.cc | 16 +- inference/models/llama.cc | 112 +- inference/models/llama.h | 11 +- inference/models/mpt.cc | 23 +- inference/models/opt.cc | 27 +- inference/models/starcoder.cc | 19 +- inference/peft/CMakeLists.txt | 139 + inference/peft/Makefile | 37 + inference/peft/peft.cc | 387 ++ inference/peft/peft_bwd_benchmark.cc | 391 ++ inference/peft/peft_fwd_benchmark.cc | 363 ++ inference/peft/req_rate_benchmark.cc | 518 ++ inference/python/ff_peft.py | 189 + inference/python/incr_decoding.py | 5 +- inference/python/peft_demo/INSTRUCTIONS.md | 25 + inference/python/peft_demo/demo.ipynb | 1907 +++++++ inference/python/peft_demo/demo.py | 240 + inference/python/spec_infer.py | 7 +- inference/spec_infer/spec_infer.cc | 11 +- inference/utils/download_peft_model.py | 68 + inference/utils/upload_peft_model.py | 142 + python/flexflow/core/__init__.py | 5 +- python/flexflow/core/flexflow_cffi.py | 5024 +++++++++-------- python/flexflow/serve/__init__.py | 43 +- python/flexflow/serve/models/base.py | 3 + python/flexflow/serve/models/falcon.py | 41 +- python/flexflow/serve/models/llama.py | 48 +- python/flexflow/serve/models/mpt.py | 46 +- python/flexflow/serve/models/opt.py | 51 +- python/flexflow/serve/models/starcoder.py | 47 +- python/flexflow/serve/serve.py | 446 +- python/flexflow/type.py | 11 + rdelacou/generate_trace.py | 121 + requirements.txt | 8 + src/c/flexflow_c.cc | 382 +- src/loss_functions/loss_functions.cpp | 8 +- src/loss_functions/loss_functions.cu | 8 +- src/ops/add_bias_residual_layer_norm.cc | 607 +- src/ops/add_bias_residual_layer_norm.cpp | 748 ++- src/ops/add_bias_residual_layer_norm.cu | 609 +- src/ops/aggregate.cc | 6 +- src/ops/aggregate.cpp | 9 +- src/ops/aggregate.cu | 7 +- src/ops/aggregate_spec.cc | 6 +- src/ops/aggregate_spec.cpp | 7 +- src/ops/aggregate_spec.cu | 7 +- src/ops/arg_topk.cc | 11 +- src/ops/argmax.cc | 42 +- src/ops/argmax.cpp | 81 +- src/ops/argmax.cu | 86 +- src/ops/attention.cc | 2 +- src/ops/attention.cpp | 2 +- src/ops/attention.cu | 2 +- src/ops/batch_matmul.cc | 4 +- src/ops/batch_norm.cpp | 2 +- src/ops/batch_norm.cu | 2 +- src/ops/beam_topk.cc | 10 +- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/cache.cc | 2 +- src/ops/cache.cpp | 2 +- src/ops/cache.cu | 2 +- src/ops/cast.cc | 2 +- src/ops/concat.cc | 4 +- src/ops/conv_2d.cc | 17 +- src/ops/element_binary.cc | 10 +- src/ops/element_unary.cc | 4 +- src/ops/element_unary.cpp | 3 +- src/ops/element_unary.cu | 3 +- src/ops/embedding.cc | 18 +- src/ops/experts.cc | 17 +- src/ops/experts.cpp | 30 +- src/ops/experts.cu | 65 +- src/ops/flat.cc | 3 +- src/ops/fused.cc | 234 +- src/ops/fused.cpp | 1257 +++-- src/ops/fused.cu | 1410 +++-- src/ops/group_by.cc | 6 +- src/ops/group_by.cpp | 6 +- src/ops/group_by.cu | 6 +- src/ops/inc_multihead_self_attention.cc | 139 +- src/ops/inc_multihead_self_attention.cpp | 1782 ++++-- src/ops/inc_multihead_self_attention.cu | 756 ++- src/ops/kernels/batch_matmul.cpp | 4 +- src/ops/kernels/batch_matmul.cu | 4 +- 
src/ops/kernels/cast_kernels.cpp | 3 +- src/ops/kernels/cast_kernels.cu | 3 +- src/ops/kernels/concat_kernels.cpp | 4 + src/ops/kernels/concat_kernels.cu | 4 + src/ops/kernels/conv_2d_kernels.cpp | 10 +- src/ops/kernels/conv_2d_kernels.cu | 10 +- src/ops/kernels/dropout_kernels.cpp | 2 +- src/ops/kernels/dropout_kernels.cu | 2 +- src/ops/kernels/flat_kernels.cpp | 4 + src/ops/kernels/flat_kernels.cu | 4 + src/ops/kernels/linear_kernels.cpp | 423 +- src/ops/kernels/linear_kernels.cu | 268 +- src/ops/kernels/lora_linear_kernels.cpp | 576 ++ src/ops/kernels/lora_linear_kernels.cu | 579 ++ src/ops/kernels/pool_2d_kernels.cpp | 4 +- src/ops/kernels/pool_2d_kernels.cu | 4 +- src/ops/kernels/reshape_kernels.cpp | 4 +- src/ops/kernels/reshape_kernels.cu | 4 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 438 +- src/ops/kernels/residual_rms_norm_kernels.cu | 454 +- src/ops/kernels/rms_norm_kernels.cpp | 396 +- src/ops/kernels/rms_norm_kernels.cu | 444 +- src/ops/kernels/softmax.cpp | 284 +- src/ops/kernels/softmax.cu | 275 +- src/ops/kernels/transpose_kernels.cpp | 4 + src/ops/kernels/transpose_kernels.cu | 4 + src/ops/layer_norm.cc | 181 +- src/ops/layer_norm.cpp | 479 +- src/ops/layer_norm.cu | 352 +- src/ops/linear.cc | 154 +- src/ops/lora_linear.cc | 1316 +++++ src/ops/lora_linear_params.cc | 221 + src/ops/mean.cc | 3 +- src/ops/noop.cc | 7 +- src/ops/pool_2d.cc | 4 +- src/ops/reduce.cc | 2 +- src/ops/reduce.cpp | 2 +- src/ops/reduce.cu | 2 +- src/ops/reshape.cc | 4 +- src/ops/residual_layer_norm.cc | 521 +- src/ops/residual_layer_norm.cpp | 695 ++- src/ops/residual_layer_norm.cu | 690 ++- src/ops/residual_rms_norm.cc | 512 +- src/ops/rms_norm.cc | 168 +- src/ops/sampling.cc | 6 +- src/ops/sigmoid_silu_multi.cc | 187 +- src/ops/sigmoid_silu_multi.cpp | 297 +- src/ops/sigmoid_silu_multi.cu | 264 +- src/ops/softmax.cc | 261 +- src/ops/spec_inc_multihead_self_attention.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 17 +- src/ops/spec_inc_multihead_self_attention.cu | 66 +- src/ops/split.cc | 2 +- src/ops/topk.cc | 6 +- src/ops/topk.cpp | 3 +- src/ops/topk.cu | 3 +- src/ops/transpose.cc | 6 +- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cpp | 654 ++- src/ops/tree_inc_multihead_self_attention.cu | 58 +- src/parallel_ops/allreduce.cc | 287 +- src/parallel_ops/combine.cc | 151 +- src/parallel_ops/fused_parallel_op.cc | 2 +- .../kernels/allreduce_kernels.cpp | 52 +- src/parallel_ops/kernels/allreduce_kernels.cu | 48 +- src/parallel_ops/kernels/combine_kernels.cpp | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 4 +- .../kernels/parallel_identity_kernels.cpp | 97 + .../kernels/parallel_identity_kernels.cu | 96 + .../kernels/partition_kernels.cpp | 4 +- src/parallel_ops/kernels/partition_kernels.cu | 4 +- .../kernels/reduction_kernels.cpp | 2 +- src/parallel_ops/kernels/reduction_kernels.cu | 2 +- .../kernels/replicate_kernels.cpp | 2 +- src/parallel_ops/kernels/replicate_kernels.cu | 2 +- src/parallel_ops/parallel_identity.cc | 474 ++ src/parallel_ops/partition.cc | 10 +- src/parallel_ops/reduction.cc | 17 +- src/parallel_ops/replicate.cc | 91 +- src/runtime/batch_config.cc | 65 +- src/runtime/beam_search_batch_config.cc | 4 + src/runtime/cuda_helper.cu | 200 +- src/runtime/ffconst_utils.cc | 5 + src/runtime/fftype.cc | 25 + src/runtime/file_loader.cc | 92 +- src/runtime/graph.cc | 88 +- src/runtime/hip_helper.cpp | 274 +- src/runtime/inference_manager.cc | 204 +- src/runtime/model.cc | 840 ++- src/runtime/model.cpp | 4 +- 
src/runtime/model.cu | 45 +- src/runtime/operator.cc | 36 +- src/runtime/operator_params.cc | 3 + src/runtime/request_manager.cc | 768 ++- src/runtime/request_manager.cpp | 45 +- src/runtime/request_manager.cu | 123 +- src/runtime/simulator.cpp | 22 +- src/runtime/simulator.cu | 26 +- src/runtime/substitution.cc | 36 +- src/runtime/tree_verify_batch_config.cc | 4 + tests/.gitignore | 1 - tests/align/test_all_operators.sh | 2 +- tests/cpp_gpu_tests.sh | 4 +- tests/inference/cpp_inference_tests.sh | 64 +- tests/inference/huggingface_inference.py | 14 +- tests/inference/python_inference_tests.sh | 35 +- .../python_test_configs/generate_configs.py | 5 +- tests/peft/alignment/align_test_utils.py | 510 ++ .../alignment/llama_alignment_tests.ipynb | 2651 +++++++++ .../peft/alignment/opt_alignment_tests.ipynb | 450 ++ tests/peft/hf_finetune.py | 129 + tests/peft/hf_serve.py | 140 + tests/peft/hf_train.py | 161 + tests/peft/hf_utils.py | 352 ++ tests/peft/peft_alignment_test.py | 730 +++ tests/peft_test.sh | 66 + 285 files changed, 35212 insertions(+), 6650 deletions(-) create mode 100644 include/flexflow/ops/kernels/lora_linear_kernels.h create mode 100644 include/flexflow/ops/lora_linear.h create mode 100644 include/flexflow/ops/lora_linear_params.h create mode 100644 include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h create mode 100644 include/flexflow/parallel_ops/parallel_identity.h create mode 100644 include/flexflow/parallel_ops/parallel_identity_params.h create mode 100644 include/flexflow/utils/peft_weight_allocator.h delete mode 100644 inference/MODEL_WEIGHTS.md create mode 100644 inference/README.md create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100644 inference/peft/peft_bwd_benchmark.cc create mode 100644 inference/peft/peft_fwd_benchmark.cc create mode 100644 inference/peft/req_rate_benchmark.cc create mode 100644 inference/python/ff_peft.py create mode 100644 inference/python/peft_demo/INSTRUCTIONS.md create mode 100644 inference/python/peft_demo/demo.ipynb create mode 100644 inference/python/peft_demo/demo.py create mode 100644 inference/utils/download_peft_model.py create mode 100644 inference/utils/upload_peft_model.py create mode 100644 rdelacou/generate_trace.py create mode 100644 src/ops/kernels/lora_linear_kernels.cpp create mode 100644 src/ops/kernels/lora_linear_kernels.cu create mode 100644 src/ops/lora_linear.cc create mode 100644 src/ops/lora_linear_params.cc create mode 100644 src/parallel_ops/kernels/parallel_identity_kernels.cpp create mode 100644 src/parallel_ops/kernels/parallel_identity_kernels.cu create mode 100644 src/parallel_ops/parallel_identity.cc delete mode 100644 tests/.gitignore create mode 100644 tests/peft/alignment/align_test_utils.py create mode 100644 tests/peft/alignment/llama_alignment_tests.ipynb create mode 100644 tests/peft/alignment/opt_alignment_tests.ipynb create mode 100644 tests/peft/hf_finetune.py create mode 100644 tests/peft/hf_serve.py create mode 100644 tests/peft/hf_train.py create mode 100644 tests/peft/hf_utils.py create mode 100644 tests/peft/peft_alignment_test.py create mode 100755 tests/peft_test.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d05856f1a9..ef5961bc87 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,13 +52,14 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: 
Jimver/cuda-toolkit@v0.2.16 if: ${{ matrix.gpu_backend == 'cuda' }} id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" + log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -156,11 +157,12 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" use-github-cache: "false" + log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -169,7 +171,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: activate-environment: flexflow - environment-file: conda/environment.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build FlexFlow diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index c7d0cd72cb..00ca2df603 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -181,6 +181,16 @@ jobs: ../config/config.linux make -j + - name: Run PEFT tests + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export CUDNN_DIR=/usr/local/cuda + export CUDA_DIR=/usr/local/cuda + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + + source ./build/set_python_envs.sh + ./tests/peft_test.sh + - name: Run inference tests env: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 7c11a4a420..73b8e88418 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -5,8 +5,11 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" +ubuntu_version=$(lsb_release -rs) +ubuntu_version=${ubuntu_version//./} + # Install CUDNN -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz @@ -44,8 +47,11 @@ elif [[ "$cuda_version" == "11.7" ]]; then elif [[ "$cuda_version" == "11.8" ]]; then CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "12.0" ]]; then - echo "CUDNN support for CUDA version 12.0 not yet added" +elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then + CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb + CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb +else + echo "CUDNN support for CUDA version above 12.5 not yet added" exit 1 fi wget -c -q $CUDNN_LINK @@ -55,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME" +elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then + wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt update -y + rm -f cuda-keyring_1.1-1_all.deb + sudo dpkg -i $CUDNN_TARBALL_NAME + sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ + sudo apt update -y + sudo apt install -y libcudnn8 + sudo apt install -y libcudnn8-dev + sudo apt install -y libcudnn8-samples else sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local fi diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh index ca88668d84..ae6793ea2a 100755 --- a/.github/workflows/helpers/install_nccl.sh +++ b/.github/workflows/helpers/install_nccl.sh @@ -8,13 +8,13 @@ cd "${BASH_SOURCE[0]%/*}" # Add NCCL key ring ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb" -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt update -y -rm -f cuda-keyring_1.0-1_all.deb +rm -f cuda-keyring_1.1-1_all.deb # Install NCCL -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing NCCL for CUDA version: ${cuda_version} ..." 
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index 226f953b38..2fc527bf08 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -38,7 +38,7 @@ jobs: # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -87,7 +87,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 @@ -138,7 +138,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index 3562134987..d5acbfc2e1 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -44,10 +44,10 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 267daaee6b..633fb00eb8 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -23,13 +23,13 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - gpu_backend_version: ["11.8", "5.6"] + gpu_backend_version: ["12.0", "5.6"] python_version: ["3.11"] exclude: - gpu_backend: "cuda" gpu_backend_version: "5.6" - gpu_backend: "hip_rocm" - gpu_backend_version: "11.8" + gpu_backend_version: "12.0" fail-fast: false steps: - name: Checkout Git Repository diff --git a/.gitignore b/.gitignore index 7f6a3c4137..cc34c1a7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -187,4 +187,9 @@ gpt_tokenizer python/flexflow/version.txt inference_tensors +hf_peft_tensors +lora_training_logs + +Untitled-1.ipynb +Untitled-2.ipynb tests/inference/python_test_configs/*.json diff --git a/CMakeLists.txt b/CMakeLists.txt index c82a53644e..f06969ae04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,6 +567,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 67ef6b3419..091ba929e4 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -25,3 +25,10 @@ dependencies: - sentencepiece - einops - requests + - scipy + - bitsandbytes + - datasets + - accelerate + - loralib + - triton + - peft diff --git a/config/config.inc b/config/config.inc index 7d7b2db9cf..6431eaf136 100644 --- a/config/config.inc +++ b/config/config.inc @@ -197,7 +197,7 @@ fi # set ROCM path if [ -n "$ROCM_PATH" ]; then - SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" + SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} 
-DHIP_ROOT_DIR=${ROCM_PATH}" fi ADD_ROCM_TO_PATH="" diff --git a/docker/build.sh b/docker/build.sh index 8ecacbc6d4..b68860712f 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -56,15 +56,14 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version_input=${cuda_version}.3 elif [[ "$cuda_version" == @(11.8) ]]; then cuda_version_input=${cuda_version}.0 + elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) + cuda_version=12.2 + cuda_version_input=${cuda_version}.2 else echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - cuda_version_input=${cuda_version}.2 - fi echo "Building $image docker image with CUDA $cuda_version" ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" gpu_backend_version="-${cuda_version}" diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index cef619ad68..3434916d6b 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -94,6 +94,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook +# PEFT-related +RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index 666c8e1121..cf105a10c8 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -58,7 +58,7 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 009d1c250a..873fed0bdb 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "legion.h" #include #include @@ -36,6 +37,18 @@ using BeamSearchBatchConfigFuture = Legion::Future; using TreeVerifyBatchConfigFuture = Legion::Future; using BeamInferenceResultFuture = Legion::Future; +struct OptimizerTasks { + bool compute_gradients = true; + bool reset_gradients_to_zero = false; + bool update_weights = false; + bool save_updated_weights = false; +}; + +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps); + class BatchConfig { public: using RequestGuid = size_t; @@ -43,6 +56,8 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; + int num_active_infr_tokens() const; + int num_active_peft_tokens() const; 
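// [Editorial sketch, not part of this patch] set_optimizer_tasks() is only declared in
// the hunk above; its definition is not shown here. A minimal illustration of how the
// four OptimizerTasks flags could be derived from a standard gradient-accumulation
// schedule (assuming the OptimizerTasks struct declared above and <cassert>) is:
inline void set_optimizer_tasks_sketch(OptimizerTasks &tasks,
                                       int max_training_steps,
                                       int completed_training_steps,
                                       int gradient_accumulation_steps) {
  assert(gradient_accumulation_steps > 0);
  assert(completed_training_steps >= 0 &&
         completed_training_steps < max_training_steps);
  // Gradients are computed on every finetuning step.
  tasks.compute_gradients = true;
  // Zero the accumulated gradients at the start of each accumulation window.
  tasks.reset_gradients_to_zero =
      (completed_training_steps % gradient_accumulation_steps == 0);
  // Apply the optimizer update at the end of each accumulation window.
  tasks.update_weights =
      ((completed_training_steps + 1) % gradient_accumulation_steps == 0);
  // Persist the updated weights only after the final training step.
  tasks.save_updated_weights =
      (completed_training_steps == max_training_steps - 1);
}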
static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); @@ -56,26 +71,43 @@ class BatchConfig { // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update - int num_tokens; + + int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; // number of tokens in prompt phase, start offset of tokens in inc_decoding // phase. num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens; + int num_generation_tokens = 0; struct PerRequestInfo { + PerRequestInfo() { + first_token_depth_in_request = 0; + first_token_offset_in_batch = 0; + num_tokens_in_batch = 0; + max_sequence_length = 0; + request_guid = 0; + prompt_phase = false; + batch_config_request_id = -1; + peft_model_id = PEFTModelID::NO_ID; + peft_bwd = false; + optimizer_tasks = {true, false, false, false}; + } int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; // request id in batch config: - int batch_config_request_id; + int batch_config_request_id = -1; bool prompt_phase = false; RequestGuid request_guid; + // PEFT fields + PEFTModelID peft_model_id; + bool peft_bwd; + OptimizerTasks optimizer_tasks; }; struct PerTokenInfo { int abs_depth_in_request; @@ -102,6 +134,7 @@ class BatchConfig { BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; + PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; @@ -129,6 +162,7 @@ class TreeVerifyBatchConfig : public BatchConfig { struct InferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; + float finetuning_loss; }; class BeamSearchBatchConfig : public BatchConfig { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2c11ae1131..dd9d657117 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -65,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; #endif class FFConfig; +class MemoryAllocator; +class PEFTWeightAllocator; + +struct CombinedBatchConfigMetaStruct { + BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS]; + BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS]; + BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS]; + bool request_completed[BatchConfig::MAX_NUM_REQUESTS]; + + BeamSearchBatchConfig::BeamSearchPerTokenInfo + beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS + + BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM * + BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + BeamSearchBatchConfig::BeamSearchPerRequestInfo + beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + + TreeVerifyBatchConfig::CommittedTokensInfo + committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS]; +}; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -76,18 +95,18 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; - void *batch_config_metadata; + CombinedBatchConfigMetaStruct *batch_config_metadata; // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = - sizeof(BatchConfig::tokensInfo) + 
sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; size_t offload_reserve_space_size; + // PEFT related fields + MemoryAllocator *peft_activation_allocator; + size_t peft_activation_reserve_space_size; + PEFTWeightAllocator *peft_weight_allocator; + size_t peft_weight_reserve_space_size; + // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL @@ -98,6 +117,8 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -155,6 +176,10 @@ class FFConfig { bool cpu_offload; size_t offload_reserve_space_size; DataType quantization_type; + // PEFT related fields + bool enable_peft; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..24b722c36f 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -46,6 +46,12 @@ enum LossType { LOSS_IDENTITY = 54, }; +enum OptimizerType { + OPTIMIZER_TYPE_NONE = 60, + OPTIMIZER_TYPE_SGD = 61, + OPTIMIZER_TYPE_ADAM = 62, +}; + enum CompMode { COMP_MODE_TRAINING = 70, COMP_MODE_INFERENCE = 71, @@ -72,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { @@ -172,6 +183,8 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, + // PEFT Ops + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, @@ -179,6 +192,7 @@ enum OperatorType { OP_REDUCTION, OP_PIPELINE, OP_ALLREDUCE, + OP_PARALLEL_IDENTITY, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -193,36 +207,37 @@ enum ModelType { }; enum PMParameter { - PM_OP_TYPE, // AnyOp - PM_NUM_INPUTS, // AnyOp - PM_NUM_OUTPUTS, // AnyOp - PM_GROUP, // Conv2D - PM_KERNEL_H, // Conv2D, Pool2D - PM_KERNEL_W, // Conv2D, Pool2D - PM_STRIDE_H, // Conv2D, Pool2D - PM_STRIDE_W, // Conv2D, Pool2D - PM_PADDING_H, // Conv2D, Pool2D - PM_PADDING_W, // Conv2D, Pool2D - PM_ACTI, // Conv2D, Pool2D - PM_NUMDIM, // Concat, Transpose - PM_AXIS, // Concat, Split - PM_PERM, // Transpose - PM_OUTSHUFFLE, // Transpose - PM_MERGE_GCONV_COUNT, // MergeGConv - PM_AXES, // Squeeze, Unsqueeze, Reduce* - PM_KEEP_DIMS, // Reduce* - PM_EPSILON, // BatchNorm - PM_REPARTITION_DIM, // Repartition - PM_REPARTITION_DEGREE, // Repartition - PM_REPLICATE_DIM, // Replicate - PM_REPLICATE_DEGREE, // Replicate - PM_COMBINE_DIM, // Combine - PM_COMBINE_DEGREE, // Combine - PM_REDUCTION_DIM, // Reduction - PM_REDUCTION_DEGREE, // Reduction - PM_ALLREDUCE_DIM, // AllReduce - PM_SOFTMAX_DIM, // Softmax - PM_NUM_HEADS, // MultiHeadAttention + PM_OP_TYPE, // AnyOp + PM_NUM_INPUTS, // AnyOp + PM_NUM_OUTPUTS, // AnyOp + PM_GROUP, // Conv2D + PM_KERNEL_H, // Conv2D, Pool2D + PM_KERNEL_W, // Conv2D, Pool2D + 
PM_STRIDE_H, // Conv2D, Pool2D + PM_STRIDE_W, // Conv2D, Pool2D + PM_PADDING_H, // Conv2D, Pool2D + PM_PADDING_W, // Conv2D, Pool2D + PM_ACTI, // Conv2D, Pool2D + PM_NUMDIM, // Concat, Transpose + PM_AXIS, // Concat, Split + PM_PERM, // Transpose + PM_OUTSHUFFLE, // Transpose + PM_MERGE_GCONV_COUNT, // MergeGConv + PM_AXES, // Squeeze, Unsqueeze, Reduce* + PM_KEEP_DIMS, // Reduce* + PM_EPSILON, // BatchNorm + PM_REPARTITION_DIM, // Repartition + PM_REPARTITION_DEGREE, // Repartition + PM_REPLICATE_DIM, // Replicate + PM_REPLICATE_DEGREE, // Replicate + PM_COMBINE_DIM, // Combine + PM_COMBINE_DEGREE, // Combine + PM_REDUCTION_DIM, // Reduction + PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce + PM_PARALLEL_IDENTITY_DIM, // AllReduce + PM_SOFTMAX_DIM, // Softmax + PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, PM_PARALLEL_DIM, PM_PARALLEL_DEGREE, @@ -268,5 +283,7 @@ enum { TENSOR_GUID_LAST_VALID = 3999999, PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, + PEFT_MODEL_ID_FIRST_VALID = 6000000, + PEFT_MODEL_ID_LAST_VALID = 6999999 }; #endif // _FLEXFLOW_CONST_H_ diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 1cd90fda26..3e482b8d67 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -3,6 +3,8 @@ #include "flexflow/ffconst.h" #include +#include +#include namespace FlexFlow { @@ -18,6 +20,29 @@ class LayerID { size_t id, transformer_layer_id, model_id; }; +class PEFTModelID { +public: + static const PEFTModelID NO_ID; + PEFTModelID(); + PEFTModelID(size_t id); + bool is_valid_id() const; + friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend std::ostream &operator<<(std::ostream &os, + PEFTModelID const &peft_model_id); + +public: + size_t id; +}; + }; // namespace FlexFlow +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::PEFTModelID const &n) const { + return n.id; + } +}; +} // namespace std + #endif // _FF_TYPE_H diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce4..52b4b3d362 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,11 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -270,6 +275,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( @@ -281,6 +287,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t @@ -565,6 +572,7 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, @@ -590,6 +598,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t 
flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -613,11 +624,16 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, - int **output_length_and_tokens); + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -978,6 +994,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -1036,6 +1055,113 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay); + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_); + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon); + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_); + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path, + char const *precision, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_); + +int 
flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_); + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules); + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_); + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value); + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id(); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index f24a797ffd..ba4101c173 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -40,6 +40,7 @@ struct GenerationResult { std::string output_text; std::vector input_tokens; std::vector output_tokens; + std::vector finetuning_losses; }; #include diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 69a57e4e1c..c3dbcac422 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + // bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; bool inference_debugging; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6dda67bbfe..4ad735ef7d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -108,19 +108,31 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, + RESIDUAL_LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, + SIGMOID_SILU_MULTI_BWD_TASK_ID, + SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, + LINEAR_PEFT_BWD_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, + LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_REG_TASK_ID, + LORA_LINEAR_INF_TASK_ID, + 
LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, FLAT_FWD_TASK_ID, FLAT_BWD_TASK_ID, @@ -128,6 +140,7 @@ enum TaskIDs { SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, SOFTMAX_INF_TASK_ID, + SOFTMAX_PEFT_BWD_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, @@ -163,20 +176,26 @@ enum TaskIDs { RMSNORM_INIT_TASK_ID, RMSNORM_FWD_TASK_ID, RMSNORM_INF_TASK_ID, + RMSNORM_BWD_TASK_ID, + RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_BWD_TASK_ID, + RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, + FUSEDOP_PEFT_BWD_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, FUSEDOP_INF_TASK_ID, @@ -224,10 +243,13 @@ enum TaskIDs { REPARTITION_BWD_TASK_ID, COMBINE_INIT_TASK_ID, COMBINE_FWD_TASK_ID, + COMBINE_INF_TASK_ID, COMBINE_BWD_TASK_ID, + COMBINE_PEFT_BWD_TASK_ID, REPLICATE_INIT_TASK_ID, REPLICATE_FWD_TASK_ID, REPLICATE_BWD_TASK_ID, + REPLICATE_PEFT_BWD_TASK_ID, REDUCTION_INIT_TASK_ID, REDUCTION_FWD_TASK_ID, REDUCTION_BWD_TASK_ID, @@ -235,9 +257,15 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, - ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_PEFT_BWD_TASK_ID, + PARALLEL_IDENTITY_INIT_TASK_ID, + PARALLEL_IDENTITY_FWD_TASK_ID, + PARALLEL_IDENTITY_BWD_TASK_ID, + PARALLEL_IDENTITY_INF_TASK_ID, + PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, @@ -327,6 +355,7 @@ class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; +class LoraLinear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; @@ -349,9 +378,12 @@ class Repartition; class Reduction; class Replicate; class AllReduce; +class ParallelIdentity; class FusedParallelOp; class ParallelOpInfo; +struct Request; + // TODO: Move to an appropriate place /* This is used to create a type that recursively replaces value type @@ -561,6 +593,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer @@ -571,6 +604,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer @@ -599,6 +633,7 @@ class FFModel { Tensor *outputs, float eps, int dim, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer @@ -808,10 +843,13 @@ class FFModel { bool position_bias = false, char const *name = NULL); // ======================================== + // PEFT Layers + // ======================================== + PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + // ======================================== // Inference APIs // ======================================== - std::vector generate(std::vector 
&prompts, - int max_seq_length); + std::vector generate(std::vector const &requests); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1103,6 +1141,9 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool need_to_add_combine(int layer_idx) const; + bool need_to_add_allreduce(int layer_idx) const; + bool need_to_add_parallel_identity(int layer_idx) const; bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, @@ -1117,7 +1158,7 @@ class FFModel { void clear_graph_search_cache(); public: - size_t op_global_guid, layer_global_guid; + size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset @@ -1137,6 +1178,12 @@ class FFModel { std::vector layers; std::vector operators; std::vector parameters; + // PEFT related + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; + std::unordered_map peft_configs; + // std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer @@ -1195,6 +1242,10 @@ class FFModel { SigmoidSiluMulti *>, std::unordered_map, Linear *>, + std::unordered_map< + std::pair, + LoraLinearParams>, + LoraLinear *>, std::unordered_map, Pool2D *>, std::unordered_map, std::unordered_map, AllReduce *>, + std::unordered_map, + ParallelIdentity *>, std::unordered_map, FusedParallelOp *>> cached_ops; diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 60785a1e29..d31c12b16c 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -9,7 +9,7 @@ class Op; class OpMeta { public: - OpMeta(FFHandler _handle); + // OpMeta(FFHandler _handle); OpMeta(FFHandler _handle, Op const *op); public: @@ -17,9 +17,11 @@ class OpMeta { bool profiling; // Measure the run time of the task bool inference_debugging; int decoding_step; + int bwd_step; char op_name[MAX_OPNAME]; LayerID layer_guid; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; DataType output_type[MAX_NUM_OUTPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 311699d926..1a5af67b36 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -7,7 +7,9 @@ #include "flexflow/machine_view.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" +#include #include +namespace fs = std::filesystem; #include #include @@ -29,6 +31,11 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT }; enum class MappingOperation { PARTITION, REPLICATE }; +fs::path get_dst_folder(std::string const &subdir, + int step_idx = 0, + int shard_idx = 0, + bool before_kernel = false); + /** @brief A class to keep track of a dimension relation between two tensors * used by an operator. 
* @@ -236,11 +243,18 @@ class Op { Legion::FutureMap empty_map; return empty_map; }; + virtual Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { + assert(false); + } virtual void print_layer(FFModel const &model) = 0; template static std::string get_op_name_without_uid(OpMetaType *m) { std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; + size_t last_underscore = op_name_without_uid.length(); for (int i = op_name_without_uid.length() - 1; i > 0; i--) { if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { break; @@ -248,7 +262,9 @@ class Op { last_underscore = i; } } - op_name_without_uid.erase(last_underscore); + if (last_underscore < op_name_without_uid.length()) { + op_name_without_uid.erase(last_underscore); + } return op_name_without_uid; } template @@ -259,31 +275,42 @@ class Op { std::vector input_tensors, std::vector weight_tensors, std::vector output_tensors, + bool fwd_pass = true, bool before_kernel = false) { - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator + // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - op_name_without_uid + "_shard-id_" + std::to_string(shard_id); - if (before_kernel) { - base_filepath += "_pre"; + std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid + << std::endl; + // build the path to save the tensor + fs::path dst_filepath; + if (fwd_pass) { + dst_filepath = + get_dst_folder("fwd", m->decoding_step, shard_id, before_kernel); + } else { + dst_filepath = + get_dst_folder("bwd", m->bwd_step, shard_id, before_kernel); + } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(dst_filepath.string() + ".batch_config"); } + // save all inputs for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".input_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (input_tensors[i].data_type == DT_FLOAT) { save_tensor(input_tensors[i].get_float_ptr(), input_tensors[i].domain.get_volume(), @@ -304,10 +331,17 @@ class Op { assert(false && "Tensor data type not supported"); } } - // only dump the weights once - if (m->decoding_step == 0) { + + // only dump the weights in the forward pass, at the first step + // note that we do not save the weight gradients, since we only support + // finetuning LoRA weights, which are not FF tensors. 
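      // [Editorial note, illustration only] With the naming scheme above, a forward-pass
      // dump for a hypothetical operator named "attention" in transformer layer 5 would
      // contain files such as
      //     <fwd dump folder>/layers.5.attention.batch_config
      //     <fwd dump folder>/layers.5.attention.input_0
      //     <fwd dump folder>/layers.5.attention.output_0
      // while the backward pass writes .input_gradient_<i> / .output_gradient_<i>
      // instead, and the weights are saved once under a separate "weights" dump folder
      // as <weights dump folder>/layers.5.attention.weight_<i>. The exact directory
      // layout returned by get_dst_folder() (e.g., whether it encodes the step and
      // shard indices) is not shown in this hunk.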
+ if (fwd_pass && m->decoding_step == 0) { + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) / + layername; for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); + std::string filename = + dst_filepath_weights.string() + ".weight_" + std::to_string(i); if (weight_tensors[i].data_type == DT_FLOAT) { save_tensor(weight_tensors[i].get_float_ptr(), weight_tensors[i].domain.get_volume(), @@ -329,9 +363,15 @@ class Op { } } } + // save all outputs for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".output_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (output_tensors[i].data_type == DT_FLOAT) { save_tensor(output_tensors[i].get_float_ptr(), output_tensors[i].domain.get_volume(), @@ -354,7 +394,11 @@ class Op { } // increase count of decoding steps if (!before_kernel) { - m->decoding_step++; + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } } virtual bool measure_operator_cost(Simulator *sim, @@ -448,7 +492,8 @@ class Op { ParallelTensor outputs[MAX_NUM_OUTPUTS]; ParallelTensor inputs[MAX_NUM_INPUTS]; ParallelParameter weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; std::map inference_meta; int numInputs, numWeights, numOutputs; diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839ef..673f78ad46 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -23,6 +23,7 @@ #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" @@ -40,6 +41,7 @@ #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" #include "flexflow/parallel_ops/partition_params.h" #include "flexflow/parallel_ops/reduction_params.h" #include "flexflow/parallel_ops/replicate_params.h" @@ -67,6 +69,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index bb470376c3..9510ac0f28 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -24,8 +24,10 @@ class AddBiasResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -38,6 +40,11 @@ class AddBiasResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -61,6 +68,14 @@ class AddBiasResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -76,21 +91,55 @@ class AddBiasResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + template + static void backward_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + ffStream_t stream); + static void + backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + template + static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + static void + peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +154,12 @@ class AddBiasResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 87fe2fb562..840f521b01 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git 
a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 3ba4f414d1..283e9a4290 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_MAX_BATCH_SIZE 64 #define AGGREGATE_MAX_N 128 +class Aggregate; + class AggregateMeta : public OpMeta { public: - AggregateMeta(FFHandler handle, int n); + AggregateMeta(FFHandler handle, Aggregate const *aggr); ~AggregateMeta(void); float **dev_exp_preds; float **dev_exp_grads; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 4302dd0733..a9f651b620 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 +class AggregateSpec; + class AggregateSpecMeta : public OpMeta { public: - AggregateSpecMeta(FFHandler handle, int n); + AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg); ~AggregateSpecMeta(void); float **dev_region_ptrs; }; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 298059e3ed..eca9943d20 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -17,6 +17,7 @@ class ArgMaxMeta : public OpMeta { size_t temp_storage_bytes = 0; int *d_offsets; void *d_out; + float *d_loss; Realm::RegionInstance reserveInst; ArgMaxMeta(FFHandler handler, Op const *op, @@ -89,18 +90,22 @@ class ArgMax : public Op { CostMetrics &cost_metrics) const override; template static void forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent_ptr, int length, int batch_size, + float *loss, ffStream_t stream); static void forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size); + int batch_size, + float *loss); Params get_params() const; public: diff --git a/include/flexflow/ops/cache.h b/include/flexflow/ops/cache.h index 1fbb1fa059..4f0b94ee5c 100644 --- a/include/flexflow/ops/cache.h +++ b/include/flexflow/ops/cache.h @@ -5,9 +5,11 @@ namespace FlexFlow { +class Cache; + class CacheMeta : public OpMeta { public: - CacheMeta(FFHandler handle); + CacheMeta(FFHandler handle, Cache const *c); float cache_score; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index ddef59549c..043b5d19a7 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -12,9 +12,11 @@ namespace FlexFlow { +class ElementUnary; + class ElementUnaryMeta : public OpMeta { public: - ElementUnaryMeta(FFHandler handle); + ElementUnaryMeta(FFHandler handle, ElementUnary const *unary); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnActivationDescriptor_t actiDesc; diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ed89fcf37a..c90e1773e0 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -60,6 +60,11 @@ class Embedding : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index d68957d890..1ed4678a5b 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -6,20 +6,11 @@ namespace FlexFlow { +class Experts; + class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation); + ExpertsMeta(FFHandler handler, Experts const *e); ~ExpertsMeta(void); // Thrust helper arrays @@ -138,7 +129,7 @@ class Experts : public Op { float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index a8326e9ab4..02ab1db7b5 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -49,6 +49,11 @@ class FusedOp : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -60,6 +65,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ec6cdfb9ab..73025216cd 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Group_by; + class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n, float _alpha); + GroupByMeta(FFHandler handle, Group_by const *gb); ~GroupByMeta(void); float alpha; float **dev_region_ptrs; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 43dc527bc8..f77df7c456 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -96,6 +96,11 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -109,17 +114,27 @@ class IncMultiHeadSelfAttention : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int 
shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, GenericTensorAccessorR const &bias); + static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias); Params get_params() const; public: @@ -204,6 +219,10 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // typedef hipFloatComplex attFloatComplex; hipFloatComplex *complex_input; #endif + // PEFT specific fields + void *softmax_activation_buffer; + void *query_activation_buffer; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/batch_matmul_kernels.h b/include/flexflow/ops/kernels/batch_matmul_kernels.h index 4de774ee06..c3923c4d4b 100644 --- a/include/flexflow/ops/kernels/batch_matmul_kernels.h +++ b/include/flexflow/ops/kernels/batch_matmul_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class BatchMatmul; + class BatchMatmulMeta : public OpMeta { public: - BatchMatmulMeta(FFHandler handler); + BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm); int a_seq_length_dim, b_seq_length_dim; }; diff --git a/include/flexflow/ops/kernels/cast_kernels.h b/include/flexflow/ops/kernels/cast_kernels.h index 3001d913ca..d601601ea2 100644 --- a/include/flexflow/ops/kernels/cast_kernels.h +++ b/include/flexflow/ops/kernels/cast_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Cast; + class CastMeta : public OpMeta { public: - CastMeta(FFHandler handle); + CastMeta(FFHandler handle, Cast const *cast); DataType input_data_type, output_data_type; }; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 4da6aaf5e2..4562ae871a 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Concat; + class ConcatMeta : public OpMeta { public: - ConcatMeta(FFHandler handle) : OpMeta(handle){}; + ConcatMeta(FFHandler handle, Concat const *cc); int legion_axis; }; diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index 7b2a0fe135..f83e4687d7 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Conv2D; + class Conv2DMeta : public OpMeta { public: - Conv2DMeta(FFHandler handler); + Conv2DMeta(FFHandler handler, Conv2D const *conv); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; cudnnFilterDescriptor_t filterDesc; diff --git a/include/flexflow/ops/kernels/flat_kernels.h b/include/flexflow/ops/kernels/flat_kernels.h index caf817512d..6aa5a13b42 100644 --- a/include/flexflow/ops/kernels/flat_kernels.h +++ b/include/flexflow/ops/kernels/flat_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Flat; + class FlatMeta : public OpMeta { public: - FlatMeta(FFHandler handle) : OpMeta(handle){}; + FlatMeta(FFHandler handle, Flat const *flat); }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index d1e0e050b2..3d122d4bc5 100644 --- 
a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -385,6 +385,25 @@ inline __device__ void zero(T &dst) { dst = tmp.raw; } +template +__device__ __forceinline__ T WARP_SHFL(unsigned mask, T var, int srcLane, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + + template inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { // use float32 to get better accuracy @@ -401,7 +420,7 @@ inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { float qk = sum(qk_vec); #pragma unroll for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + qk += WARP_SHFL_XOR(uint32_t(-1), qk, mask); } return qk; } @@ -423,7 +442,7 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Compute the sum per warp. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Warp leaders store the data to shared memory. @@ -442,11 +461,11 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Parallel reduction inside the warp. #pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Broadcast to other threads. 
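  // [Editorial note] WARP_SHFL / WARP_SHFL_XOR above are portability shims: on CUDA
  // they forward to __shfl_sync / __shfl_xor_sync with an explicit participation mask,
  // while on HIP they fall back to the legacy, mask-free __shfl / __shfl_xor, so the
  // mask argument is simply ignored on that path.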
- return __shfl_sync(uint32_t(-1), sum, 0); + return WARP_SHFL(uint32_t(-1), sum, 0); } template diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index a5fdc7c602..90e50a0c9a 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -35,6 +35,9 @@ class LinearMeta : public OpMeta { float kernel_reg_lambda; bool use_bias, add_bias_only_once; Realm::RegionInstance reserveInst; + // PEFT related fields + void *output_activation_buffer; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +51,23 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *filter_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens); void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -73,6 +93,16 @@ void forward_kernel(LinearMeta const *m, int batch_size, ffStream_t stream); template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -85,6 +115,7 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); + template __global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h new file mode 100644 index 0000000000..5360b5f8ea --- /dev/null +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -0,0 +1,77 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H + +#include "flexflow/accessor.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/lora_linear.h" + +namespace FlexFlow { + +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + int in_dim, out_dim, rank, num_shards; +}; + +struct LoraLinearModelState { + LoraLinearWeight weights; + LoraOptimizerConfig const *optimizer_config; + float lora_alpha; + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; +}; + +class LoraLinearMeta : public OpMeta { +public: + LoraLinearMeta(FFHandler handle, LoraLinear const *li); + ~LoraLinearMeta(void); + // PEFT related fields + void *low_rank_activation; + void *input_activation; + std::unordered_map model_state; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; +}; + +namespace Kernels { +namespace LoraLinear { +void init_kernel_wrapper(LoraLinearMeta *m, int seed); +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +namespace Internal { +template +void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow +#endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index 7f73a8295d..c5a954763e 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Pool2D; + class Pool2DMeta : public OpMeta { public: - Pool2DMeta(FFHandler handle); + Pool2DMeta(FFHandler handle, Pool2D const *pool); ffTensorDescriptor_t inputTensor, outputTensor; ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; diff --git a/include/flexflow/ops/kernels/reshape_kernels.h b/include/flexflow/ops/kernels/reshape_kernels.h index e6c8c4d569..5b6fa5be19 100644 --- a/include/flexflow/ops/kernels/reshape_kernels.h +++ b/include/flexflow/ops/kernels/reshape_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Reshape; + class ReshapeMeta : public OpMeta { public: - ReshapeMeta(FFHandler handler); + ReshapeMeta(FFHandler handler, Reshape const *reshape); DataType data_type; }; @@ -44,4 +46,4 @@ void backward_kernel(T *input_grad_ptr, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72b..fd4e0ecf1d 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RESIDUAL_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,14 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - + bool inplace_residual; int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +50,28 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + 
GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 35c5aa69fa..475b6d94ed 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,13 @@ class RMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -46,6 +47,22 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight); } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..0b7f1090f6 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -23,20 +23,30 @@ class SoftmaxMeta : public OpMeta { bool profiling; bool inference_debugging; int dim; - DataType input_type, output_type; }; namespace Kernels { namespace Softmax { -template + void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr); -template + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements); + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); namespace Internal { template @@ -46,10 +56,28 @@ void 
forward_kernel(SoftmaxMeta const *m, ffStream_t stream); template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); + +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + ffStream_t stream); + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + ffStream_t stream); + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/include/flexflow/ops/kernels/transpose_kernels.h b/include/flexflow/ops/kernels/transpose_kernels.h index 7ff6163b30..a2c8ff0483 100644 --- a/include/flexflow/ops/kernels/transpose_kernels.h +++ b/include/flexflow/ops/kernels/transpose_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Transpose; + class TransposeMeta : public OpMeta { public: - TransposeMeta(FFHandler handler) : OpMeta(handler){}; + TransposeMeta(FFHandler handler, Transpose const *transpose); int num_dim; int perm[MAX_TENSOR_DIM]; }; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9e48d81190..b5e9538ea6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -37,6 +37,11 @@ class LayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -67,6 +72,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -81,11 +90,6 @@ class LayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -96,13 +100,34 @@ class LayerNorm : public Op { T *beta_grad_ptr, ffStream_t stream); template + static void peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + + static void forward_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); static void backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + static void inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW 
&output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; @@ -124,6 +149,9 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index a32df80537..ed2fad580f 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -52,6 +52,11 @@ class Linear : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * @@ -66,6 +71,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h new file mode 100644 index 0000000000..9e83c3f90e --- /dev/null +++ b/include/flexflow/ops/lora_linear.h @@ -0,0 +1,99 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_FIRST_H +#define _FLEXFLOW_LORA_LINEAR_FIRST_H + +#include "flexflow/inference.h" +#include "flexflow/node.h" +#include "flexflow/operator.h" +#include "flexflow/ops/lora_linear_params.h" +#include "flexflow/utils/memory_allocator.h" + +namespace FlexFlow { + +class FFModel; +class Layer; + +class LoraLinear : public Op { +public: + using Params = LoraLinearParams; + using Input = std::pair; + + LoraLinear( + FFModel &model, + LayerID const &layer_guid, + OperatorType type, + ParallelTensor const input, + ParallelTensor const output, + std::unordered_map const &_peft_configs, + char const *name = nullptr); + LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output); + LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr); + + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + void map_output_tensors(FFModel &model) override; + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + 
Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + // size_t get_params_hash() const override; + LoraLinearParams get_params() const; + + std::unordered_map peft_configs; +}; + +}; // namespace FlexFlow + +#endif // _FLEXLOW_LORA_LINEAR_FIRST_H diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h new file mode 100644 index 0000000000..70539271f2 --- /dev/null +++ b/include/flexflow/ops/lora_linear_params.h @@ -0,0 +1,150 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_PARAMS_H +#define _FLEXFLOW_LORA_LINEAR_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/inference.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_tensor.h" +#include +#include +#include +#include + +namespace FlexFlow { + +class LoraOptimizerConfig { +public: + LoraOptimizerConfig(); + virtual ~LoraOptimizerConfig() {} +}; + +class LoraSGDOptimizerConfig : public LoraOptimizerConfig { +public: + LoraSGDOptimizerConfig(); + LoraSGDOptimizerConfig(double lr_, + double momentum_ = 0.0f, + bool nesterov_ = false, + bool weight_decay_ = 0.0f); + friend std::ostream &operator<<(std::ostream &os, + LoraSGDOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + +public: + double lr = 0.001f; + double momentum = 0.0f; + bool nesterov = false; + double weight_decay = 0.0f; +}; + +class LoraAdamOptimizerConfig : public LoraOptimizerConfig { +public: + LoraAdamOptimizerConfig(); + LoraAdamOptimizerConfig(double alpha_, + double beta1_ = 0.9f, + double beta2_ = 0.999f, + double weight_decay_ = 0.0f, + double epsilon_ = 1e-8); + friend std::ostream &operator<<(std::ostream &os, + LoraAdamOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + +public: + // Adam + double alpha = 0.001f; + double beta1 = 0.9f; + double beta2 = 0.999f; + double weight_decay = 0.0f; + double epsilon = 1e-8; +}; + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath); + +// Function to deserialize JSON from file and create object +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath); + +class LoraLinearConfig { +public: + static const LoraLinearConfig EmptyConfig; + LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_ = false, + LoraOptimizerConfig *optimizer_config_ = nullptr, + bool init_lora_weights_ = false, + std::string const &base_model_name_or_path_ = "", + std::string const 
&precision_ = "fp16", + int rank_ = 8, + float lora_alpha_ = 8.0f, + float lora_dropout_ = 0.0f, + std::vector const &target_modules_ = {}); + // constructor used to support std::unordered_map + LoraLinearConfig(); + friend bool operator==(LoraLinearConfig const &lhs, + LoraLinearConfig const &rhs); + friend std::ostream &operator<<(std::ostream &os, + LoraLinearConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, + cache_folder, + peft_model_id, + rank, + lora_alpha, + lora_dropout, + target_modules, + trainable, + init_lora_weights, + base_model_name_or_path, + precision) + + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; + // Lora parameters + int rank; + float lora_alpha; + float lora_dropout; + std::vector target_modules; + // Training parameters + // whether the weights are trainable (fine-tuning scenario) or not + // (inference-only). If set to true, allocate space for the gradients + bool trainable = false; + LoraOptimizerConfig *optimizer_config; + // whether to initialize weights randomly (instead of attempting to load them + // from file) + bool init_lora_weights; + // parameters only used to upload model after finetuning + std::string base_model_name_or_path; + std::string precision; +}; + +class LoraLinearParams { +public: + LayerID layer_guid; + OperatorType type; + std::unordered_map peft_configs; + char name[MAX_OPNAME]; + + bool is_valid(std::pair const + &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, + LoraLinearParams const &rhs); +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::LoraLinearParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 0e9be82125..33a8e8be51 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -26,8 +26,10 @@ class ResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -40,6 +42,11 @@ class ResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -65,6 +72,14 @@ class ResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -78,7 +93,8 @@ class ResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + static void inference_kernel_wrapper(ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, 
GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -86,11 +102,30 @@ class ResidualLayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void + backward_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + + static void + peft_bwd_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +140,12 @@ class ResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 949ae0c799..166d4b2b4e 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid(std::tuple const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -74,6 +81,14 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -82,6 +97,7 @@ class ResidualRMSNorm : public Op { float eps; int effective_batch_size; int dim, data_dim; + bool inplace_residual; }; } // namespace FlexFlow #endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index a4e4de59ab..8b8f666dc1 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 
1dc940ebd3..384404d8a0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -34,6 +34,11 @@ class RMSNorm : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, @@ -73,6 +78,14 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260a..ac60ff15dd 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -1,5 +1,6 @@ #pragma once +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/memory_allocator.h" @@ -27,6 +28,11 @@ class SigmoidSiluMulti : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -55,6 +61,14 @@ class SigmoidSiluMulti : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -65,10 +79,24 @@ class SigmoidSiluMulti : public Op { T const *input2_ptr, T *output_ptr, ffStream_t stream); - static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + static void inference_kernel_wrapper(SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output); + static void + backward_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); + static void + peft_bwd_kernel_wrapper(SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); }; class SigmoidSiluMultiMeta : public OpMeta { @@ -80,6 +108,9 @@ class SigmoidSiluMultiMeta : public OpMeta { public: Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow 
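A pattern worth noting across these headers: each op's meta gains an `input_activation` pointer and an `allocated_peft_buffer_size` counter, and each op gains a `peft_bwd` entry point. During the inference pass the operator stashes the activations that a later PEFT backward pass will need; `peft_bwd` then reads that cached copy instead of the live forward tensors. The sketch below only illustrates this caching idiom; `ToyOpMeta` and `cache_peft_activations` are made-up names, and in the actual kernels the buffer size comes from the `BatchConfig` while the allocation presumably goes through FlexFlow's own allocators rather than raw `cudaMalloc`.

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Illustrative stand-ins for the fields this patch adds to each OpMeta.
struct ToyOpMeta {
  void *input_activation = nullptr;
  size_t allocated_peft_buffer_size = 0;
};

// Inference-time half of the idiom: if the batch contains a PEFT request
// whose backward pass runs later, grow the cache if needed and stash a copy
// of this operator's input activations.
void cache_peft_activations(ToyOpMeta *m,
                            void const *input,   // device pointer
                            size_t peft_tokens,  // tokens of the PEFT request
                            size_t hidden_dim,
                            size_t dtype_size,
                            cudaStream_t stream) {
  size_t needed = peft_tokens * hidden_dim * dtype_size;
  if (needed > m->allocated_peft_buffer_size) {
    if (m->input_activation != nullptr) {
      cudaFree(m->input_activation);
    }
    cudaMalloc(&m->input_activation, needed);
    m->allocated_peft_buffer_size = needed;
  }
  cudaMemcpyAsync(m->input_activation, input, needed,
                  cudaMemcpyDeviceToDevice, stream);
}
```

The matching `peft_bwd_kernel_wrapper` can then read `m->input_activation` when computing input gradients, even though the forward tensors have since been overwritten by other requests in the batch.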
diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 61094f7361..82aff53766 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -33,6 +33,11 @@ class Softmax : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { @@ -58,6 +63,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 47144bf6d7..4b67692032 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class TopK; + class TopKMeta : public OpMeta { public: - TopKMeta(FFHandler handle); + TopKMeta(FFHandler handle, TopK const *topk); bool sorted; }; diff --git a/include/flexflow/ops/transpose.h b/include/flexflow/ops/transpose.h index 3e6fb575c0..bca0b83460 100644 --- a/include/flexflow/ops/transpose.h +++ b/include/flexflow/ops/transpose.h @@ -6,6 +6,8 @@ namespace FlexFlow { +class TransposeMeta; + class Transpose : public Op { public: using Params = TransposeParams; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 02df0c0137..168ad5f618 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - int num_active_tokens; + int num_active_infr_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; bool *request_completed; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 045f9b36a0..7e0e4362e2 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -34,12 +34,17 @@ class AllReduce : public ParallelOp { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; @@ -47,10 +52,6 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void 
forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -59,6 +60,14 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 2e4fdb86a9..1db776f59d 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -40,6 +40,11 @@ class Combine : public ParallelOp { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( @@ -52,10 +57,18 @@ class Combine : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); template static void forward_task_with_type(Legion::Task const *task, diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index bdf7aae501..a4ccbee8a5 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -17,11 +17,6 @@ class AllReduceMeta : public OpMeta { namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -30,6 +25,15 @@ void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); } // namespace AllReduce } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 456013cd81..4b2227b178 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Combine; + class CombineMeta : public OpMeta { public: - CombineMeta(FFHandler handle); + CombineMeta(FFHandler handle, Combine const 
*comb); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h new file mode 100644 index 0000000000..fd6778a37f --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H + +#include "flexflow/batch_config.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/parallel_identity.h" + +namespace FlexFlow { + +class ParallelIdentityMeta : public OpMeta { +public: + ParallelIdentityMeta(FFHandler handle, ParallelIdentity const *reduct); +}; + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H diff --git a/include/flexflow/parallel_ops/kernels/partition_kernels.h b/include/flexflow/parallel_ops/kernels/partition_kernels.h index 81b190603a..1e77090d11 100644 --- a/include/flexflow/parallel_ops/kernels/partition_kernels.h +++ b/include/flexflow/parallel_ops/kernels/partition_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Repartition; + class RepartitionMeta : public OpMeta { public: - RepartitionMeta(FFHandler handle); + RepartitionMeta(FFHandler handle, Repartition const *repart); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/parallel_identity.h b/include/flexflow/parallel_ops/parallel_identity.h new file mode 100644 index 0000000000..b3ca789f08 --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity.h @@ -0,0 +1,83 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_H +#define _FLEXFLOW_PARALLEL_IDENTITY_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class ParallelIdentity : public ParallelOp { +public: + using Params = ParallelIdentityParams; + using Input = ParallelTensor; + + ParallelIdentity(FFModel &model, + const ParallelTensor input, + int parallel_identity_legion_dim, + char const *name = NULL); + ParallelIdentity(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + 
Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int parallel_identity_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_H diff --git a/include/flexflow/parallel_ops/parallel_identity_params.h b/include/flexflow/parallel_ops/parallel_identity_params.h new file mode 100644 index 0000000000..6eeed662ec --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H +#define _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H + +namespace FlexFlow { + +struct ParallelIdentityParams { + int parallel_identity_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(ParallelIdentityParams const &, ParallelIdentityParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ParallelIdentityParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index 0bf573996c..39324c2a51 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -41,7 +41,7 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; std::unordered_map - inference_input_lps; + inference_input_lps, inference_output_grad_lps; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 65d69d8564..c27616634f 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -54,10 +54,19 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_kernel_wrapper(ReplicateMeta const *m, 
GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a38a3b2671..f0fab957ee 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -39,6 +39,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); + void peft_bwd(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(FFModel *model, BatchConfigFuture const &bc, ParallelTensor const input, @@ -65,15 +66,34 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; - int max_sequence_length; + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + int max_sequence_length = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; Status status = PENDING; std::vector tokens; - + std::string prompt; std::vector beam_trees; + // PEFT field + RequestType req_type = REQ_INFERENCE; + size_t processed_finetuning_tokens = 0; + int completed_training_steps = 0; + int dataset_entry_processed_tokens = 0; + int max_training_steps = 1; + // how many gradient accumulation steps to do before updating the weights. if + // left as -1, it will be set to the number of entries in the dataset + int gradient_accumulation_steps = -1; + int benchmarking_tokens = -1; + std::vector finetuning_tokens_per_batch; + bool warmup = false; + std::string dataset_filepath; + std::vector, + std::vector>> + dataset; + std::vector finetuning_losses; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search @@ -120,6 +140,8 @@ class RequestManager { void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_enable_peft_finetuning(bool enable_peft_finetuning_); + static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -143,10 +165,9 @@ class RequestManager { void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + RequestGuid register_new_request(Request const &request_); + RequestGuid register_new_peft_request(Request const &request_); + // Methods to start and terminate request manager's background task void start_background_server(FFModel *model); bool is_background_server_terminated(); @@ -156,6 +177,8 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool check_inf_req_completion(BatchConfig const &old_bc, int i); + void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -265,6 +288,10 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + // peft benchmarking + bool enable_peft_finetuning = false; + static bool inference_finished; + // tree width in each 
speculative step, if not specified 1 std::vector spec_infer_tree_width; @@ -275,7 +302,8 @@ class RequestManager { int bos_token_id; int eos_token_id; std::string output_filepath; - std::queue pending_request_queue; + std::queue pending_infr_request_queue; + std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; @@ -304,6 +332,8 @@ class RequestManager { int llm_decoding_steps; int ssm_decoding_steps; double start_time, finish_time; + double registration_time, first_token_time; + bool first_token_time_set = false; }; std::unordered_map profiling_requests; double total_request_run_time; diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index e410f66325..6cda96aa8b 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -33,21 +33,21 @@ namespace FlexFlow { #define MOD(a, b) ((a) % (b)) < 0 ? ((a) % (b)) + (b) : ((a) % (b)) -class Conv2DMeta; -class LinearMeta; -class Pool2DMeta; -class ElementUnaryMeta; -class ElementBinaryMeta; -class LayerNormMeta; -// class EmbeddingMeta; -// class SoftmaxMeta; -class BatchMatmulMeta; -// class BatchNormMeta; -class ConcatMeta; -// class DropoutMeta; -class TransposeMeta; -class Op; -class FFModel; +// class Conv2DMeta; +// class LinearMeta; +// class Pool2DMeta; +// class ElementUnaryMeta; +// class ElementBinaryMeta; +// class LayerNormMeta; +// class EmbeddingMeta; +// class SoftmaxMeta; +// class BatchMatmulMeta; +// class BatchNormMeta; +// class ConcatMeta; +// class DropoutMeta; +// class TransposeMeta; +// class Op; +// class FFModel; /** * @brief Costs of an operator. @@ -751,19 +751,19 @@ class Simulator { strict_hash_to_operator_cost; public: - Conv2DMeta *conv2d_meta; - LinearMeta *linear_meta; - Pool2DMeta *pool2d_meta; - ElementUnaryMeta *ele_unary_meta; - LayerNormMeta *layernorm_meta; - // ElementBinaryMeta *ele_binary_meta; - // EmbeddingMeta *embedding_meta; - // SoftmaxMeta *softmax_meta; - BatchMatmulMeta *batch_matmul_meta; - // BatchNormMeta *batch_norm_meta; - ConcatMeta *concat_meta; - // DropoutMeta *dropout_meta; - TransposeMeta *transpose_meta; + // Conv2DMeta *conv2d_meta; + // LinearMeta *linear_meta; + // Pool2DMeta *pool2d_meta; + // ElementUnaryMeta *ele_unary_meta; + // LayerNormMeta *layernorm_meta; + // ElementBinaryMeta *ele_binary_meta; + // EmbeddingMeta *embedding_meta; + // SoftmaxMeta *softmax_meta; + // BatchMatmulMeta *batch_matmul_meta; + // BatchNormMeta *batch_norm_meta; + // ConcatMeta *concat_meta; + // DropoutMeta *dropout_meta; + // TransposeMeta *transpose_meta; int segment_size; int max_num_segments; // simulation could be slow if the number of segments // are too large diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..486a65eb3d 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? 
BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -156,10 +156,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, @@ -179,3 +182,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..805cc46b4c 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -86,6 +86,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_discrete(DT *dst, + const DT *src, + Legion::coord_t size, + size_t *index); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -135,16 +141,28 @@ __host__ void updateGAS(float *para_ptr, float learning_rate); template -void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id = 0); +template +void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, @@ -153,7 +171,8 @@ miopenStatus_t miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); hipblasDatatype_t ff_to_cuda_datatype(DataType type); @@ -164,3 +183,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); void handle_unimplemented_hip_kernel(OperatorType op_type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/memory_allocator.h 
b/include/flexflow/utils/memory_allocator.h
index 7091b159b2..fad7630770 100644
--- a/include/flexflow/utils/memory_allocator.h
+++ b/include/flexflow/utils/memory_allocator.h
@@ -54,6 +54,11 @@ class MemoryAllocator {
     return static_cast<DT *>
(ptr); } + inline void free_all() { + reserved_allocated_size = 0; + instance_allocated_size = 0; + } + public: Legion::Memory memory; void *reserved_ptr; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h new file mode 100644 index 0000000000..dae46a8af1 --- /dev/null +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -0,0 +1,92 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ + +#include "flexflow/config.h" +#include + +namespace FlexFlow { + +class PEFTWeightAllocator { +public: + PEFTWeightAllocator(void *_base_ptr, size_t _total_size) + : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), + local_offset(_total_size) {} + + inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + void *ptr = static_cast(base_ptr) + sync_offset; + off_t model_sync_weights_offset = sync_offset; + size_t model_sync_weights_size = datalen; + if (sync_weights.find(peft_model_id) != sync_weights.end()) { + // Assert that sync weights for each PEFT model is consecutive + std::pair offset_and_size = sync_weights[peft_model_id]; + assert(sync_offset == offset_and_size.first + offset_and_size.second); + model_sync_weights_offset = offset_and_size.first; + model_sync_weights_size = offset_and_size.second + datalen; + } + sync_offset += datalen; + assert(sync_offset < local_offset); + sync_weights[peft_model_id] = + std::make_pair(model_sync_weights_offset, model_sync_weights_size); + return ptr; + } + + std::pair + get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { + const std::lock_guard lock(peft_weight_allocator_mutex); + assert(sync_weights.find(peft_model_id) != sync_weights.end()); + std::pair offset_and_size = sync_weights[peft_model_id]; + return std::make_pair(static_cast(base_ptr) + offset_and_size.first, + offset_and_size.second); + } + + inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + local_offset -= datalen; + assert(sync_offset < local_offset); + void *ptr = static_cast(base_ptr) + local_offset; + return ptr; + } + + template + inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
<DT *>(
+        allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count));
+  }
+
+  template <typename DT>
+  inline DT *allocate_local_weights(PEFTModelID const &peft_model_id,
+                                    size_t count) {
+    return static_cast<DT *>
( + allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + +public: + void *base_ptr; + size_t total_size; + off_t sync_offset, local_offset; + std::unordered_map> sync_weights; + std::mutex peft_weight_allocator_mutex; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000..14c94e22ac --- /dev/null +++ b/inference/README.md @@ -0,0 +1,42 @@ +# Inference Examples +This folder contains the code to run inference examples in FlexFlow + +To create a sample prompt, call (from the `build` folder): + +```bash +mkdir -p ../inference/prompt +echo '["San Francisco is a "]' > ../inference/prompt/test.json +``` + +To download a model for use in C++, call: +```bash +huggingface-cli login # if needed +python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only +``` + +To run the incremental decoding example in C++, call: + +```bash +./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run the speculative inference example in C++, call: + +```bash +./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run a PEFT model example in C++, call: + +```bash +./inference/peft/peft \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging +``` \ No newline at end of file diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index ec3dda3158..c9ffff5c07 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -264,15 +264,18 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, 
/*ignore_comments */ true); - std::vector prompts; + + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); } - std::vector result = - model.generate(prompts, 128 /*max_sequence_length*/); + std::vector result = model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a529411ddb..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -89,8 +89,9 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -116,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -141,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -166,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -187,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -203,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -216,6 +217,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, "ln_f"); Tensor ln_f = res_ln_outputs[1]; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 517f534438..cf26194597 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -84,8 +84,9 @@ void LLAMA::create_llama_model(FFModel &ff, token_att_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -94,10 +95,11 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.spec_inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -111,16 +113,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -134,16 +137,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -157,7 +161,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -175,54 +179,56 @@ void LLAMA::create_llama_model(FFModel &ff, token_ff_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." 
+ std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -231,6 +237,7 @@ void LLAMA::create_llama_model(FFModel &ff, final_rms_norm_output, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, "norm"); @@ -244,7 +251,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -261,7 +268,8 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); - output = ff.argmax(dense, /*beam_Search*/ false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } } @@ -269,7 +277,7 @@ void LLAMA::create_llama_model(FFModel &ff, "", weight_file_path, llama_config.num_attention_heads, - llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, diff --git a/inference/models/llama.h b/inference/models/llama.h index ba1f0236f9..edb78f1300 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -36,6 +36,11 @@ class LLAMA { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; num_attention_heads = model_config["num_attention_heads"]; + if (model_config.find("num_key_value_heads") != model_config.end()) { + num_key_value_heads = model_config["num_key_value_heads"]; + } else { + num_key_value_heads = num_attention_heads; + } hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; @@ -61,6 +66,8 @@ 
class LLAMA { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tnum_attention_heads: " << num_attention_heads << std::endl; + std::cout << "\tnum_key_value_heads: " << num_key_value_heads + << std::endl; std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; @@ -73,8 +80,8 @@ class LLAMA { // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; - int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, - intermediate_size; + int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, + hidden_size, intermediate_size; float rms_norm_eps; }; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 70e2b5e9c5..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -86,8 +86,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -113,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -137,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -161,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -181,8 +182,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -198,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -211,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -224,8 +226,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5677d5658e..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -94,8 +94,9 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." + std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -121,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -145,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -169,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -186,9 +187,10 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -205,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -216,7 +218,10 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("fc2"), std::string("layers." 
+ + // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -229,6 +234,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, "final_layer_norm"); Tensor all_final_norm = res_ln_outputs[1]; @@ -243,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -252,7 +258,8 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 8b0dc1098c..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -96,8 +96,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -124,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -144,8 +145,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -161,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -176,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -188,8 +190,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..e0bad79cab --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,139 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) + +# Normal PEFT +set(project_target1 peft) +set(CPU_SRC1 + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + +# FWD benchmark +set(project_target2 peft_fwd_benchmark) +set(CPU_SRC2 + ${FLEXFLOW_CPP_DRV_SRC} + peft_fwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target2} DESTINATION ${BIN_DEST}) + +# BWD benchmark +set(project_target3 peft_bwd_benchmark) +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + 
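+  # benchmark driver source, followed by the model definitions shared with the other PEFT targets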
peft_bwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) + +# Online peft +set(project_target4 req_rate_benchmark) +set(CPU_SRC4 + ${FLEXFLOW_CPP_DRV_SRC} + req_rate_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC4} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target4} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target4} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target4} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target4} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target4} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target4} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..c55f2c0bfd --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,387 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string dataset_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // dataset for finetuning + if (!strcmp(argv[i], "-finetuning-dataset")) { + paths.dataset_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], 
"--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 1; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + bool enable_peft_finetuning = true; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + LoraOptimizerConfig *optim_config = nullptr; + if (enable_peft_finetuning) { + // float sgd_learning_rate = 2e-1; + float sgd_learning_rate = 1.0f; + optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); + } + LoraLinearConfig peft_config_finetuning = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, + peft_model_name, + true /*trainable*/, + optim_config, + false /*init_lora_weights*/, + llm_model_name, + use_full_precision ? 
"fp32" : "fp16"); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + if (enable_peft_finetuning) { + peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + } + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + if (!file_paths.prompt_file_path.empty()) { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + int total_num_requests = 0; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; + } + } + + // Add fine-tuning request + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = (peft_model_id_finetuning != nullptr) + ? 
*peft_model_id_finetuning + : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.dataset_file_path; + fine_tuning_req.max_training_steps = 2; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc new file mode 100644 index 0000000000..86d6d8cbbf --- /dev/null +++ b/inference/peft/peft_bwd_benchmark.cc @@ -0,0 +1,391 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], 
"--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + 
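+      // synthetic warmup request: the dummy token counts below stand in for a real prompt (benchmarking mode)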
inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector lengths; + int index = 0; + for (auto &entry : prompt_json) { + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; + } + printf("Total number of finetuning requests: %ld", lengths.size()); + + // Add fine-tuning requests + for (int i = 0; i < lengths.size(); i++) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------finetuning finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc new file mode 100644 index 0000000000..9ff042c157 --- /dev/null +++ b/inference/peft/peft_fwd_benchmark.cc @@ -0,0 +1,363 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %ld", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc new file mode 100644 index 0000000000..43008e74fe --- /dev/null +++ b/inference/peft/req_rate_benchmark.cc @@ -0,0 +1,518 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "inference/models/falcon.h" +#include "inference/models/llama.h" +#include "inference/models/mpt.h" +#include "inference/models/opt.h" +#include "inference/models/starcoder.h" +#include +#include +#include +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +class ConcurrentQueue { +public: + std::queue inf_queue; + std::queue peft_queue; + std::mutex request_queue_mutex; + bool producer_finished = false; +}; + +ConcurrentQueue *common_guids_singleton = nullptr; +int nb_millisecs = 1000; // Default bucket timeframe is 1 second + +ConcurrentQueue *get_common_guids_queue() { + if (common_guids_singleton == nullptr) { + common_guids_singleton = new ConcurrentQueue(); + } + return common_guids_singleton; +} + +void consume() { + RequestManager *rm = RequestManager::get_request_manager(); + ConcurrentQueue *guids = get_common_guids_queue(); + bool producer_is_finished = false; + bool queue_is_empty = false; + // int i=0; + while (!producer_is_finished || !queue_is_empty) { + RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; + { + const std::lock_guard lock(guids->request_queue_mutex); + queue_is_empty = guids->inf_queue.empty(); + producer_is_finished = guids->producer_finished; + if (!queue_is_empty) { + guid = guids->inf_queue.front(); + guids->inf_queue.pop(); + } + } + if (guid != RequestManager::INVALID_GUID) { + GenerationResult result = rm->get_generation_result(guid); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(nb_millisecs)); + } + // i++; + // cout << "Iteration " << i; + } + rm->set_inference_finished(); + + while (guids->peft_queue.size() > 0) { + GenerationResult result = + rm->get_generation_result(guids->peft_queue.front()); + guids->peft_queue.pop(); + } +} + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + 
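
The request-rate benchmark above decouples request submission from result collection: the main thread registers requests and pushes their GUIDs into a mutex-protected `ConcurrentQueue`, while the `consume()` thread pops GUIDs and blocks on `get_generation_result` until the producer marks `producer_finished`. A minimal Python sketch of the same producer/consumer hand-off (standard library only; the names are illustrative and not part of the FlexFlow API) is:

```python
import queue
import threading
import time

def consume(guid_queue, results, done):
    # Drain the queue until the producer signals completion and no GUIDs remain,
    # mirroring the producer_finished / inf_queue checks in consume() above.
    while not (done.is_set() and guid_queue.empty()):
        try:
            guid = guid_queue.get(timeout=1.0)
        except queue.Empty:
            continue
        # Stand-in for rm->get_generation_result(guid), which blocks until the
        # request has finished generating.
        results[guid] = f"result-for-{guid}"

def produce(guid_queue, done, n):
    for guid in range(n):
        guid_queue.put(guid)   # like guids->inf_queue.push(guid)
        time.sleep(0.01)       # requests arrive over time
    done.set()                 # like guids->producer_finished = true

if __name__ == "__main__":
    q = queue.Queue()
    finished = threading.Event()
    out = {}
    consumer = threading.Thread(target=consume, args=(q, out, finished))
    consumer.start()
    produce(q, finished, n=5)
    consumer.join()            # like consumer.join() in the C++ benchmark
    print(sorted(out))
```
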
std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_buckets_to_run, + int &bucket_timeframe) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-buckets-to-run")) { + max_buckets_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--bucket-timeframe")) { + bucket_timeframe = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_buckets_to_run = 1000000000; + bool enable_peft_finetuning = false; + int bucket_timespan = 1; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_buckets_to_run, + bucket_timespan); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Now run online workload! 
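
The loop that follows replays an online trace: the prompt file is a list of per-second arrival buckets, and every request in bucket `i` is submitted at `start_time + i * bucket_timeframe` (scaled to milliseconds through `nb_millisecs`). A rough Python sketch of that replay logic, assuming the same JSON layout of `{"human": prompt_tokens, "gpt": output_tokens}` entries used by the C++ code, is:

```python
import json
import time

def replay_trace(prompt_file, bucket_timeframe_s, submit):
    """Replay per-second arrival buckets: bucket i is submitted at
    start + i * bucket_timeframe_s. `submit` is a stand-in for
    rm->register_new_request(); names here are illustrative only."""
    with open(prompt_file) as f:
        buckets = json.load(f)  # list of lists of {"human": ..., "gpt": ...}
    start = time.monotonic()
    for i, bucket in enumerate(buckets):
        if not bucket:
            continue  # empty buckets only advance the clock
        # Sleep until this bucket's scheduled arrival time.
        target = start + i * bucket_timeframe_s
        time.sleep(max(0.0, target - time.monotonic()))
        for entry in bucket:
            prompt_len, output_len = entry["human"], entry["gpt"]
            submit(prompt_len, prompt_len + output_len)

# Example usage (prints instead of submitting real requests):
# replay_trace("trace.json", 1.0, lambda p, m: print(f"prompt={p}, max_len={m}"))
```
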
+ + nb_millisecs = nb_millisecs * bucket_timespan; + int total_num_requests = 0; + int num_arrival_buckets = 0; + ConcurrentQueue *guids = get_common_guids_queue(); + std::thread consumer{consume}; + { + + // Load all requests in advance + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + auto const &lists = prompt_json.get>>(); + std::vector bucket_arrival_times_s; + std::vector>> buckets; + + size_t index = 0; + for (auto const &list : lists) { + if (!list.empty()) { + bucket_arrival_times_s.push_back(index); + std::vector> prompts; + for (auto const &dict : list) { + int prompt_length = dict["human"]; + int sequence_length = dict["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + } + buckets.push_back(prompts); + } + index++; + } + assert(bucket_arrival_times_s.size() == buckets.size() && + "Bucket arrival times and buckets are not the same size"); + // for (int i=0; i<10; i++) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, + // bucket_arrival_times_s[i]); printf("bucket[%i]: %i\n", i, + // buckets[i].size()); for (const auto& prompt : buckets[i]) { + // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); + // } + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1000000000; + RequestManager::RequestGuid ft_guid = + rm->register_new_peft_request(fine_tuning_req); + if (ft_guid != RequestManager::INVALID_GUID) { + const std::lock_guard lock(guids->request_queue_mutex); + guids->peft_queue.push(ft_guid); + } + + // Replay the trace of inference requests + auto start_time = std::chrono::steady_clock::now(); + for (int i = 0; i < bucket_arrival_times_s.size(); i++) { + if (bucket_arrival_times_s[i] >= max_buckets_to_run) { + break; + } + // sleep until bucket arrives + auto bucket_arrival_time = + start_time + + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + std::this_thread::sleep_until(bucket_arrival_time); + + // create inference requests for the bucket + std::vector requests; + for (auto const &prompt : buckets[i]) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + { + const std::lock_guard lock(guids->request_queue_mutex); + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid = + rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + guids->inf_queue.push(guid); + } + } + } + } + + { // Notify the consumer that no more requests are incoming + const std::lock_guard lock(guids->request_queue_mutex); + guids->producer_finished = true; + } + } + + // Wait for consumer to finish + consumer.join(); + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..a7d38a66b6 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,189 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 10000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": False, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "inference_peft_model_id": "goliaro/llama-160m-lora", + "finetuning_peft_model_id": "goliaro/llama-160m-lora", + # "base_model": "meta-llama/Meta-Llama-3-8B", + # "inference_peft_model_id": "goliaro/llama-3-8b-lora", + # "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": True, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../prompt/peft_dataset.json", + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
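
For reference, the `-config-file` path above expects a single JSON object that merges the runtime and model keys shown in the sample dictionaries. A hypothetical minimal file (key names taken from the sample configs above; the values are placeholders, not tuned recommendations) could be produced like this:

```python
import json

# Hypothetical minimal config for: python ff_peft.py -config-file peft_config.json
example_config = {
    "num_gpus": 1,
    "memory_per_gpu": 14000,
    "zero_copy_memory_per_node": 10000,
    "enable_peft": True,
    "base_model": "JackFram/llama-160m",
    "inference_peft_model_id": "goliaro/llama-160m-lora",
    "finetuning_peft_model_id": "goliaro/llama-160m-lora",
    "prompt": "",
    "finetuning_dataset": "../prompt/peft_dataset.json",
    "output_file": "",
}

with open("peft_config.json", "w") as f:
    json.dump(example_config, f, indent=2)
```
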
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + # Add inference and/or finetuning lora + lora_inference_config = None + lora_finetuning_config = None + if len(configs.prompt) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model, + ) + llm.add_peft(lora_inference_config) + if len(configs.finetuning_dataset) > 0: + # lora_finetuning_config = ff.LoraLinearConfig( + # llm.cache_path, + # configs.finetuning_peft_model_id, + # target_modules=["down_proj"], + # rank=16, + # lora_alpha=16, + # trainable=True, + # init_lora_weights=True, + # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + # ) + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + trainable=True, + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, + }, + ) + llm.add_peft(lora_finetuning_config) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=configs.finetuning_dataset, + max_training_steps=2, + ) + requests.append(finetuning_request) + + results = llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 05599ea6b9..f888982f2c 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md new file mode 100644 index 0000000000..9b2a7a53b2 --- /dev/null +++ 
b/inference/python/peft_demo/INSTRUCTIONS.md @@ -0,0 +1,25 @@ +## Peft Demo +* `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` +* `cd FlexFlow/` + +* If you wish to run the demo by installing FlexFlow + * `conda env create -f conda/flexflow.yml` + * `conda activate flexflow` + +* If you wish to run the demo using a Docker container + * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` + +* Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) + + * `export HUGGINGFACE_TOKEN="[Your token]"` + * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + +* Run the demo + ``` + mkdir inference/output + cd inference/python/peft_demo/ + python3 demo.py -config-file demo_config.json + ``` + + diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb new file mode 100644 index 0000000000..dfb5193a1d --- /dev/null +++ b/inference/python/peft_demo/demo.ipynb @@ -0,0 +1,1907 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FlexFlow Co-Serving Demo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def create_datasets(finetune_dataset_size=2, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):\n", + " \"\"\"Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k.\n", + " Only the 'open_qa' and 'closed_qa' prompts without context are kept.\n", + " The datasets are saved into the files given as arguments.\n", + "\n", + " Keyword arguments:\n", + " dataset_size -- the number of prompts to consider\n", + " inference_file_path -- the file in which to save the inference data\n", + " finetuning_file_path -- the file in which to save the finetuning data\n", + " \"\"\"\n", + " dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n", + " inference_data = []\n", + " finetuning_data = []\n", + " for row in dataset:\n", + " if len(finetuning_data) == finetune_dataset_size:\n", + " break\n", + " if (\"open_qa\" in row['category'] or \"closed_qa\" in row['category']) and len(row['context']) == 0:\n", + " inference_data.append(row['instruction'])\n", + " finetuning_data.append(row['instruction'] + \" \" + row['response'])\n", + " with open(inference_file_path, 'w') as file:\n", + " json.dump(inference_data[:1], file)\n", + " with open(finetuning_file_path, 'w') as file:\n", + " json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': '))" + ] + }, + { + 
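
The `create_datasets` helper above writes two small JSON files: a list of raw instructions for inference and a list of instruction-plus-response strings for finetuning. Assuming a single Dolly-style open_qa row, their contents have roughly the following shape (the strings below are made up for illustration):

```python
import json

# Shape of the files written by create_datasets() above (illustrative values).
inference_dataset = ["What is machine learning?"]
finetuning_dataset = ["What is machine learning? Machine learning is ..."]

with open("inference_dataset.json", "w") as f:
    json.dump(inference_dataset, f)
with open("finetuning_dataset.json", "w") as f:
    json.dump(finetuning_dataset, f, indent=2, separators=(",", ": "))
```
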
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration fields" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(configs_dict[\"seed\"])\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "create_datasets(inference_file_path=configs_dict[\"inference_dataset\"], \n", + " finetuning_file_path=configs_dict[\"finetuning_dataset\"])\n", + "\n", + "# Clear output file\n", + "with open(configs.output_file, 'w') as file:\n", + " file.write('')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download base and peft inference models" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n" + ] + }, + { + "data": { + "text/plain": [ + "CompletedProcess(args=['python', '../../utils/download_peft_model.py', 'goliaro/llama-3-8b-lora', '--base_model_name', 'meta-llama/Meta-Llama-3-8B'], returncode=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize FlexFlow runtime and LLM object" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 - 7f4d49d21280] 0.672934 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.672995 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673107 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673118 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673124 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", + "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n" + ] + } + ], + "source": [ + "# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "# Add inference and/or finetuning lora\n", + "lora_inference_config = None\n", + "lora_finetuning_config = None\n", + "if len(configs.inference_dataset) > 0:\n", + " lora_inference_config = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.inference_peft_model_id,\n", + " base_model_name_or_path=configs.base_model\n", + " )\n", + " llm.add_peft(lora_inference_config)\n", + "if len(configs.finetuning_dataset) > 0:\n", + " lora_finetuning_config = ff.LoraLinearConfig(\n", + " llm.cache_path,\n", + " configs.finetuning_peft_model_id,\n", + " trainable=True,\n", + " init_lora_weights=False,\n", + " rank=16,\n", + " lora_alpha=16.0,\n", + " # target_modules = [\"down_proj\"],\n", + " base_model_name_or_path=configs.base_model,\n", + " optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,\n", + " optimizer_kwargs={\n", + " \"learning_rate\": configs.learning_rate,\n", + " \"momentum\": configs.momentum,\n", + " \"weight_decay\": configs.weight_decay,\n", + " \"nesterov\": configs.nesterov,\n", + " },\n", + " )\n", + " llm.add_peft(lora_finetuning_config)\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "enable_peft_finetuning = len(configs.finetuning_dataset) > 0\n", + "llm.compile(\n", + " generation_config,\n", + " enable_peft_finetuning=enable_peft_finetuning,\n", + " max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning),\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Background server started.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n" + ] + } + ], + "source": [ + "llm.start_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate inference" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "[]\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small 
speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 1.600215 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + "Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file 
layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file 
layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + 
"Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file 
layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + 
"Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, 
shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, 
num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, 
num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7f4ce019c740] 24.015346 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 24.062661 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0190740] 24.128376 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 24.199797 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 24.255941 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0178740] 24.306545 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 24.357210 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0190740] 24.407958 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0178740] 24.459366 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0178740] 24.510618 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 24.560416 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 24.611335 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 24.663808 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0178740] 24.710965 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 24.756020 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 24.805719 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0178740] 24.858560 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 24.910607 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 24.958879 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 25.002851 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 
25.050780 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 25.104554 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0184740] 25.159509 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 25.211003 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 25.261411 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 25.312357 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 25.362253 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 25.412284 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0184740] 25.461502 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0184740] 25.513610 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 25.564433 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0184740] 25.613662 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 25.663786 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 25.712708 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 25.762206 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0184740] 25.812755 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 25.863367 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 25.913378 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0184740] 25.965063 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0178740] 26.015739 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.065768 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0178740] 26.115556 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 26.166644 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 26.218528 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 26.269681 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 26.320250 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 26.371698 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 26.422587 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 26.474391 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 26.524817 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0190740] 26.575224 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0178740] 26.627207 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0190740] 26.679366 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0178740] 26.729921 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.779766 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 26.832104 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0184740] 26.884087 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.935580 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 26.992909 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 27.043722 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 27.093960 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 27.144937 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0190740] 27.196991 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 27.248143 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0190740] 27.299549 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0190740] 27.351395 
{3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 27.402975 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0190740] 27.453662 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0178740] 27.504152 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 27.554072 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 27.605613 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 27.656807 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0190740] 27.707595 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 27.757815 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 27.809557 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 27.862148 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 27.914188 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0178740] 27.965942 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 28.017837 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0184740] 28.069997 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0184740] 28.122560 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0190740] 28.172513 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0190740] 28.224002 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 28.276536 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 28.327091 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 28.377124 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0190740] 28.427226 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0190740] 28.477499 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 28.528489 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 28.580135 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 28.631761 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 28.683392 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 28.734001 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 28.783914 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0190740] 28.835832 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 28.885271 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0190740] 28.936179 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0190740] 28.987163 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 29.038264 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0184740] 29.084248 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 29.129864 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 29.175946 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 29.226707 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0184740] 29.277372 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 29.329588 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 29.380856 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0190740] 29.431483 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 29.483399 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 29.536268 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0190740] 29.588317 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 29.638727 
{3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0190740] 29.689708 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0190740] 29.740987 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 29.791166 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0190740] 29.841776 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 29.893514 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 29.945509 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 29.945878 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7f4ce0178740] 29.945889 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without\n", + "[0 - 7f4ce0178740] 29.945900 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(23696232.0) finish(29945893.0) latency(6249661.0) ttft(22415078.0)\n" + ] + } + ], + "source": [ + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_1 = llm.generate(inference_requests)\n", + "with open(\"before_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_1[0].output_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform Finetuning on dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 29.957050 {3}{RequestManager}: [0] input: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30 8215 2053 1005 279 8834 304 872 305 12055 311 2567 1124 10409 449 4907 323 88000 369 1317 18852 315 892 13\n", + "[0 - 7f4d49d21280] 29.957061 {3}{RequestManager}: [0] output:\n", + "Loss: 2.6536\n", + "Loss: 2.5942\n", + "Loss: 2.5360\n", + "Loss: 2.5083\n", + "Loss: 2.4783\n", + "Loss: 2.4570\n", + "Loss: 2.4420\n", + "Loss: 2.4194\n", + "Loss: 2.4050\n", + "Loss: 2.3949\n", + "Loss: 2.3841\n", + "Loss: 2.3764\n", + "Loss: 2.3676\n", + "Loss: 2.3535\n", + "Loss: 2.3396\n", + "Loss: 2.3299\n", + "Loss: 2.3287\n", + "Loss: 2.3215\n", + "Loss: 2.3058\n", + "Loss: 2.2978\n", + "Loss: 2.2885\n", + "Loss: 2.2852\n", + "Loss: 2.2660\n", + "Loss: 2.2619\n", + "Loss: 2.2594\n", + "Loss: 2.2479\n", + "Loss: 2.2379\n", + "Loss: 2.2243\n", + "Loss: 2.2245\n", + "Loss: 2.2057\n", + "Loss: 2.2035\n", + "Loss: 2.1891\n", + "Loss: 2.1817\n", + "Loss: 2.1703\n", + "Loss: 2.1592\n", + "Loss: 2.1548\n", + "Loss: 2.1383\n", + "Loss: 2.1321\n", + "Loss: 2.1179\n", + "Loss: 2.1138\n", + "Loss: 2.1062\n", + "Loss: 2.0934\n", + "Loss: 
2.0856\n", + "Loss: 2.0758\n", + "Loss: 2.0656\n", + "Loss: 2.0532\n", + "Loss: 2.0497\n", + "Loss: 2.0410\n", + "Loss: 2.0258\n", + "Loss: 2.0161\n", + "Loss: 2.0047\n", + "Loss: 1.9940\n", + "Loss: 1.9820\n", + "Loss: 1.9737\n", + "Loss: 1.9614\n", + "Loss: 1.9486\n", + "Loss: 1.9378\n", + "Loss: 1.9281\n", + "Loss: 1.9174\n", + "Loss: 1.9047\n", + "Loss: 1.8922\n", + "Loss: 1.8798\n", + "Loss: 1.8674\n", + "Loss: 1.8574\n", + "Loss: 1.8485\n", + "Loss: 1.8301\n", + "Loss: 1.8213\n", + "Loss: 1.8091\n", + "Loss: 1.8007\n", + "Loss: 1.7850\n", + "Loss: 1.7784\n", + "Loss: 1.7606\n", + "Loss: 1.7496\n", + "Loss: 1.7320\n", + "Loss: 1.7216\n", + "Loss: 1.7067\n", + "Loss: 1.6954\n", + "Loss: 1.6781\n", + "Loss: 1.6667\n", + "Loss: 1.6551\n", + "Loss: 1.6425\n", + "Loss: 1.6272\n", + "Loss: 1.6096\n", + "Loss: 1.6030\n", + "Loss: 1.5824\n", + "Loss: 1.5724\n", + "Loss: 1.5558\n", + "Loss: 1.5399\n", + "Loss: 1.5266\n", + "Loss: 1.5109\n", + "Loss: 1.4952\n", + "Loss: 1.4829\n", + "Loss: 1.4648\n", + "Loss: 1.4496\n", + "Loss: 1.4360\n", + "Loss: 1.4154\n", + "Loss: 1.4010\n", + "Loss: 1.3958\n", + "Loss: 1.3719\n", + "Loss: 1.3562\n", + "[0 - 7f4ce0190740] 38.933268 {3}{RequestManager}: [Finetuning] guid(1000001) completed_training_steps(100) processed_finetuning_tokens(3400) latency(38933176.0)\n" + ] + } + ], + "source": [ + "finetuning_request = ff.Request(\n", + " ff.RequestType.REQ_FINETUNING,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),\n", + " dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),\n", + " max_training_steps=configs.max_training_steps,\n", + ")\n", + "ft_res = llm.generate([finetuning_request])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAABm/UlEQVR4nO3de1yUdfr/8fcICKKioqgIJKaVHe1gBw94KA9ZmYpKiqVW+3VLLcnd2tq21O1gWdtWW1m2pZ3QjDTL7UQlHlK3rNztaG1KKmIeERVFGu7fH/dvBoaZYQ4MzAzzej4ePMa5577v+TB+UC8/13V9LIZhGAIAAAAAuNUk2AMAAAAAgFBH4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBACNUEFBgSwWiwoKCoI9lIi3aNEiWSwWbdq0KdhD8cpPP/2kIUOGqFWrVrJYLHrrrbeCPSS/FBYWymKx6NFHHw32UAA0EgROABq1cPhH6znnnKOTTjpJhmG4PadPnz7q0KGDfvvttwYcWfiYPXu2LBaLOnTooLKyMqfX09PTddVVVwVhZOFn0qRJ+vrrr/XAAw/olVdeUc+ePV2eZwtM3H099NBDDTxyAKhf0cEeAABEugkTJujOO+/U2rVr1a9fP6fXCwsLtWHDBk2fPl3R0fyxXZs9e/Zo/vz5+sMf/hDsoYSlY8eOacOGDbr77rs1ffp0r64ZP368rrjiCqfj5513XqCHBwBBxd/AABBk2dnZuuuuu5Sbm+sycFq8eLEMw9CECROCMLrwcu655+qRRx7R1KlT1axZs2APp0EdPXpUzZs3r9M99u7dK0lq3bq119ecf/75uvbaa+v0vgAQDkjVAwBJX331lYYNG6aEhAS1aNFCl112mTZu3OhwTkVFhebMmaNTTjlFcXFxatu2rfr27av8/Hz7Obt379b111+v1NRUxcbGKjk5WSNGjFBhYaHb905LS1O/fv2Ul5eniooKp9dzc3PVtWtXXXzxxfrll180depUnXbaaWrWrJnatm2rsWPH1np/m/T0dE2ePNnp+IABAzRgwACHY+Xl5Zo1a5a6deum2NhYpaWl6Y477lB5eXmt7zF9+nS1aNHCZbrc+PHj1bFjR1mtVknSpk2bNHToULVr107NmjVTly5ddMMNN3j8Pmpz77336tdff9X8+fNrPc9dDZgt/WzRokX2Y5MnT1aLFi20fft2XXXVVWrRooVSUlL09NNPS5K+/vprXXrppWrevLk6d+6s3Nxcl+9ZVlam3//+92rbtq0SEhI0ceJEHTx40Om89957TxkZGWrevLlatmypK6+8Ut9++63DObYx/fzzz7riiivUsmVLj4G1pzk+e/Zsde7cWZJ0++23y2KxKD09vdZ7esuWKvnhhx/q3HPPVVxcnM444wwtW7bM6dytW7dq7NixSkxMVHx8vC655BL961//cjrv+PHjmj17tk499VTFxcUpOTlZmZmZ+vnnn53OXbBggbp27arY2FhdeOGF+vzzzx1e9+fnFkDkYcUJQMT79ttvlZGRoYSEBN1xxx2KiYnRc889pwEDBmj16tW6+OKLJZn/sJw7d65+97vf6aKLLlJpaak2bdqkL7/8UoMHD5YkjR49Wt9++61uueUWpaena8+ePcrPz9f27dtr/UfohAkTNGXKFH3wwQcOtThff/21vvnmG917772SpM8//1zr16/XuHHjlJqaqsLCQs2fP18DBgzQd999p/j4+Dp/HpWVlbr66qu1bt06TZkyRaeffrq+/vpr/f3vf9ePP/5Ya7OAa665Rk8//bT+9a9/aezYsfbjZWVleueddzR58mRFRUVpz549GjJkiJKSknTnnXeqdevWKiwsdPkPaV9kZGTo0ksv1bx583TzzTcHbNXJarVq2LBh6tevn+bNm6fXXntN06dPV/PmzXX33XdrwoQJyszM1LPPPquJEyeqV69e6tKli8M9pk+frtatW2v27NnasmWL5s+fr19++cUexEnSK6+8okmTJmno0KF6+OGHVVZWpvnz56tv37766quvHObQb7/9pqFDh6pv37569NFHa/2992aOZ2ZmqnXr1rrtttvs6XctWrTw+NmUlZVp3759Tsdbt27tkFr6008/6ZprrtFNN92kSZMmaeHChRo7dqzef/99+8/Pr7/+qt69e6usrEy33nqr2rZtq5deeklXX3218vLyNGrUKPvvx1VXXaWPP/5Y48aN04wZM3T48GHl5+frm2++UdeuXe3vm5ubq8OHD+v3v/+9LBaL5s2bp8zMTG3dulUxMTGS/P+5BRBhDABoxBYuXGhIMj7//HO354wcOdJo2rSp8fPPP9uP7dq1y2jZsqXRr18/+7EePXoYV155pdv7HDx40JBkPPLIIz6P88CBA0ZsbKwxfvx4h+N33nmnIcnYsmWLYRiGUVZW5nTthg0bDEnGyy+/bD+2atUqQ5KxatUq+7HOnTsbkyZNcrq+f//+Rv/+/e3PX3nlFaNJkybG2rVrHc579tlnDUnGp59+6vb7qKysNFJSUozRo0c7HF+6dKkhyVizZo1hGIaxfPlyj78vvpg1a5Yhydi7d6+xevVqQ5Lx2GOP2V/v3Lmzw++dq8/HMAxj27ZthiRj4cKF9mOTJk0yJBkPPvig/djBgweNZs2aGRaLxViyZIn9+A8//GBIMmbNmmU/ZpuDF1xwgXHixAn78Xnz5hmSjBUrVhiGYRiHDx82Wrdubfzf//2fw5h2795ttGrVyuG4bUx33nmnV5+Pt3Pc9v17M4dt57r72rBhg/3czp07G5KMN998037s0KFDRnJysnHeeefZj+Xk5BiSHObe4cOHjS5duhjp6emG1Wo1DMMwXnzxRaffY5vKykqH8bVt29Y4cOCA/fUVK1YYkox33nnHMIy6/dwCiCyk6gGIaFarVR9++KFGjhypk08+2X48OTlZ2dnZWrdunUpLSyWZ/4P+7bff6qeffnJ5r2bNmqlp06YqKChwmYJVmzZt2uiKK67Q22+/raNHj0qSDMPQkiVL1LNnT5166qn297CpqKjQ/v371a1bN7Vu3VpffvmlT+/pzhtvvKHTTz9d3bt31759++xfl156qSRp1apVbq+1WCwaO3as3n33XR05csR+/PXXX1dKSor69u0rqaqGZuXKlS7TE+uiX79+GjhwoObNm6djx44F7L6/+93v7L9u3bq1TjvtNDVv3lxZWVn246eddppat26trVu3Ol0/ZcoU+wqHJN18882Kjo7Wu+++K0nKz89XSUmJxo8f7/C5R0VF6eKLL3b5ud98880ex+3LHPfHlClTlJ+f7/R1xhlnOJzXqVMn+4qRJHu64ldffaXdu3dLkt59911ddNFF9nkiSS1atNCUKVNUWFio7777TpL05ptvql27drrlllucxmNbvbO55ppr1KZNG/vzjIwMSbL/HtXl5xZAZCFwAhDR9u7dq7KyMp122mlOr51++umqrKzUjh07JEl//etfVVJSol
NPPVVnn322br/9dv33v/+1nx8bG6uHH35Y7733njp06GBP67L9o9CTCRMm6OjRo1qxYoUkaf369SosLHSoXTl27JjuvfdepaWlKTY2Vu3atVNSUpJKSkp06NChunwUdj/99JO+/fZbJSUlOXzZgrc9e/bUev0111yjY8eO6e2335YkHTlyRO+++67Gjh1r/0dt//79NXr0aM2ZM0ft2rXTiBEjtHDhQo81VN6aPXu2du/erWeffTYg94uLi1NSUpLDsVatWik1NdXpH+qtWrVy+Q/wU045xeF5ixYtlJycbK+jsQXkl156qdNn/+GHHzp97tHR0UpNTfU4dl/muD9OOeUUDRo0yOkrISHB4bxu3bo5fVa2OWX7DH755Re347S9Lkk///yzTjvtNK+6TJ500kkOz21BlO33qK4/twAiB4ETAHipX79++vnnn/Xiiy/qrLPO0j//+U+df/75+uc//2k/JycnRz/++KPmzp2ruLg43XPPPTr99NP11Vdfebz/VVddpVatWtmbC+Tm5ioqKkrjxo2zn3PLLbfogQceUFZWlpYuXaoPP/xQ+fn5atu2rSorK2u9f81/tNrYmjXYVFZW6uyzz3a5ipCfn6+pU6fW+j6XXHKJ0tPTtXTpUknSO++8o2PHjumaa65xGEteXp69zXpRUZFuuOEGXXDBBQ4rVf7q16+fBgwY4HbVydvPwiYqKsqn40Yte3K5Y/v9e+WVV1x+7raA2iY2NlZNmvDXuCfe/B7V5ecWQOSgOQSAiJaUlKT4+Hht2bLF6bUffvhBTZo0UVpamv1YYmKirr/+el1//fU6cuSI+vXrp9mzZzukcXXt2lV/+MMf9Ic//EE//fSTzj33XP3tb3/Tq6++WutYYmNjNWbMGL388sv69ddf9cYbb+jSSy9Vx44d7efk5eVp0qRJ+tvf/mY/dvz4cZWUlHj8Xtu0aePyvF9++cUhhatr1676z3/+o8suu8xtgOFJVlaWnnjiCZWWlur1119Xenq6LrnkEqfzLrnkEl1yySV64IEHlJubqwkTJmjJkiUOn6e/Zs+erQEDBui5555zes226lDz87CtaNSHn376SQMHDrQ/P3LkiIqLi+17INkaGrRv316DBg0K2Pv6Osfry//+9z8ZhuEwp3788UdJsjdg6Ny5s9tx2l6XzM/q3//+tyoqKhzSH+vC359bAJGD/6oCENGioqI0ZMgQrVixwqH18K+//qrc3Fz17dvXnnK0f/9+h2tbtGihbt262dPLysrKdPz4cYdzunbtqpYtW3qdgjZhwgRVVFTo97//vfbu3evUYjoqKsppNeMf//iH25WSmmPZuHGjTpw4YT+2cuVKpzStrKwsFRUV6fnnn3e6x7Fjx+w1WLW55pprVF5erpdeeknvv/++Qx2QZKZJ1fw+zj33XEly+Kx+/vlnl+2lvdG/f38NGDBADz/8sNPvS+fOnRUVFaU1a9Y4HH/mmWf8ei9vLFiwwKGea/78+frtt980bNgwSdLQoUOVkJCgBx980GXdl22PJV/5Msfr065du7R8+XL789LSUr388ss699xz7f85cMUVV+izzz7Thg0b7OcdPXpUCxYsUHp6ur1uavTo0dq3b5+eeuopp/fxdbUvED+3ACIDK04AIsKLL76o999/3+n4jBkzdP/99ys/P199+/bV1KlTFR0dreeee07l5eWaN2+e/dwzzjhDAwYM0AUXXKDExERt2rRJeXl5mj59uiTzf88vu+wyZWVl6YwzzlB0dLSWL1+uX3/91SHdrjb9+/dXamqqVqxYoWbNmikzM9Ph9auuukqvvPKKWrVqpTPOOEMbNmzQRx99pLZt23q89+9+9zvl5eXp8ssvV1ZWln7++We9+uqrDq2bJem6667T0qVLddNNN2nVqlXq06ePrFarfvjhBy1dulQffPCBevbsWet7nX/++erWrZvuvvtulZeXO6TpSdJLL72kZ555RqNGjVLXrl11+PBhPf/880pISLCvwEjSZZddJkl+76cza9Ysh1Uem1atWmns2LH6xz/+IYvFoq5du2rlypUe67fq4sSJE/b5sWXLFj3zzDPq27evrr76aklms4T58+fruuuu0/nnn69x48YpKSlJ27dv17/+9S/16dPHZaDgDW/nuD++/PJLl6syXbt2Va9evezPTz31VN144436/PPP1aFDB7344ov69ddftXDhQvs5d955pxYvXqxhw4bp1ltvVWJiol566SVt27ZNb775pj01ceLEiXr55Zc1c+ZMffbZZ8rIyNDRo0f10UcfaerUqRoxYoTX4w/Ezy2ACBHEjn4AUO9sraDdfe3YscMwDMP48ssvjaFDhxotWrQw4uPjjYEDBxrr1693uNf9999vXHTRRUbr1q2NZs2aGd27dzceeOABe4vpffv2GdOmTTO6d+9uNG/e3GjVqpVx8cUXG0uXLvVpzLfffrshycjKynJ67eDBg8b1119vtGvXzmjRooUxdOhQ44cffnBqNe6u3fbf/vY3IyUlxYiNjTX69OljbNq0yakduWEYxokTJ4yHH37YOPPMM43Y2FijTZs2xgUXXGDMmTPHOHTokFffx913321IMrp16+b02pdffmmMHz/eOOmkk4zY2Fijffv2xlVXXWVs2rTJ4bzOnTsbnTt39vhe1duR19S/f39DklMr+b179xqjR4824uPjjTZt2hi///3vjW+++cZlO/LmzZu7vO+ZZ57pdLxm63PbHFy9erUxZcoUo02bNkaLFi2MCRMmGPv373e6ftWqVcbQoUONVq1aGXFxcUbXrl2NyZMnO3w27sZUG2/meCDbkVefj7bP5IMPPjDOOeccIzY21ujevbvxxhtvON33559/NsaMGWO0bt3aiIuLMy666CJj5cqVTueVlZUZd999t9GlSxcjJibG6NixozFmzBh7y/XavhdVaxkfqJ9bAI2fxTD8qGAFAADwUnp6us466yytXLky2EMBAL9R4wQAAAAAHhA4AQAAAIAHBE4AAAAA4AE1TgAAAADgAStOAAAAAOABgRMAAAAAeBBxG+BWVlZq165datmypSwWS7CHAwAAACBIDMPQ4cOH1alTJ/sm2+5EXOC0a9cupaWlBXsYAAAAAELEjh07lJqaWus5ERc4tWzZUpL54SQkJAR5NFJFRYU+/PBDDRkyRDExMcEeDsIE8wb+YN7AX8wd+IN5A3809LwpLS1VWlqaPUaoTcQFTrb0vISEhJAJnOLj45WQkMAfKvAa8wb+YN7AX8wd+IN5A38Ea954U8JDcwgAAAAA8IDACQAAAAA8IHACAAAAAA8InAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAAAAAMADAicAAAAA8IDAKYisVmn1aovWrEnR6tUWWa3BHhEAAAAAVwicgmTZMik9XRo8OFqPPdZTgwdHKz3dPA4AAAAgtBA4BcGyZdKYMdLOnY7Hi4rM4wRPAAAAQGghcGpgVqs0Y4ZkGM6v2Y7l5Ii0P
QAAACCEEDg1sLVrnVeaqjMMaccO8zwAAAAAoYHAqYEVFwf2PAAAAAD1j8CpgSUnB/Y8AAAAAPWPwKmBZWRIqamSxeL6dYtFSkszzwMAAAAQGgicGlhUlPTEE+avawZPtuePP26eBwAAACA0EDgFQWamlJcnpaQ4Hm/Z0jyemRmccQEAAABwjcApSDIzpcJCKT//N11xxVZJUuvW0siRwRwVAAAAAFcInIIoKkrq39/QpEnfKiHB0Pbt0po1wR4VAAAAgJoInEJAbGylxowxd7996aUgDwYAAACAEwKnEHHddZWSzBqno0eDPBgAAAAADgicQkTv3oZOPlk6ckRavjzYowEAAABQHYFTiLBYpIkTzV+TrgcAAACEFgKnEGILnD7+WNqxI7hjAQAAAFCFwCmEdOki9esnGYb02mvBHg0AAAAAm6AGTnPnztWFF16oli1bqn379ho5cqS2bNni8bqSkhJNmzZNycnJio2N1amnnqp33323AUZc/6qn6xlGcMcCAAAAwBTUwGn16tWaNm2aNm7cqPz8fFVUVGjIkCE6WktbuRMnTmjw4MEqLCxUXl6etmzZoueff14pKSkNOPL6M3as1KyZ9MMP0rPPSosXSwUFktUa7JEBAAAAkSs6mG/+/vvvOzxftGiR2rdvry+++EL9+vVzec2LL76oAwcOaP369YqJiZEkpaen1/dQG0xCgtSzp7R2rTR1atXx1FTpiSekzMzgjQ0AAACIVEENnGo6dOiQJCkxMdHtOW+//bZ69eqladOmacWKFUpKSlJ2drb+9Kc/KSoqyun88vJylZeX25+XlpZKkioqKlRRURHg78B3tjHYHpcvt2jt2ihJFofziooMjRkjLVli1ahR5PBFuprzBvAG8wb+Yu7AH8wb+KOh540v72MxjNCopKmsrNTVV1+tkpISrVu3zu153bt3V2FhoSZMmKCpU6fqf//7n6ZOnapbb71Vs2bNcjp/9uzZmjNnjtPx3NxcxcfHB/R7qCurVZoyZYj2749TzcDJZKhdu2N67rl8uYgRAQAAAPigrKxM2dnZOnTokBISEmo9N2QCp5tvvlnvvfee1q1bp9TUVLfnnXrqqTp+/Li2bdtmX2F67LHH9Mgjj6i4uNjpfFcrTmlpadq3b5/HD6chVFRUKD8/X4MHD9b69U01eLDnRcD8/N/Uv39I/LYhSKrPG1vKKuAJ8wb+Yu7AH8wb+KOh501paanatWvnVeAUEql606dP18qVK7VmzZpagyZJSk5OVkxMjENa3umnn67du3frxIkTatq0qcP5sbGxio2NdbpPTExMSP0Qx8TEaO9e73479u6NVggNHUEUavMY4YF5A38xd+AP5g380VDzxpf3CGpXPcMwNH36dC1fvlyffPKJunTp4vGaPn366H//+58qKyvtx3788UclJyc7BU3hJjk5sOcBAAAACIygBk7Tpk3Tq6++qtzcXLVs2VK7d+/W7t27dezYMfs5EydO1F133WV/fvPNN+vAgQOaMWOGfvzxR/3rX//Sgw8+qGnTpgXjWwiojAyze57FVXmTzONpaeZ5AAAAABpOUAOn+fPn69ChQxowYICSk5PtX6+//rr9nO3btzvULqWlpemDDz7Q559/rnPOOUe33nqrZsyYoTvvvDMY30JARUWZLccl98HT44+LxhAAAABAAwtqjZM3fSkKCgqcjvXq1UsbN26shxEFX2amlJcnzZgh7dxZdbxFC+mll9jHCQAAAAiGoK44wbXMTKmwUFq1SrItpEVHS1dcEdRhAQAAABGLwClERUVJAwZI999v1j2VlEhvvx3sUQEAAACRicApxEVFSRMnmr9etCioQwEAAAAiFoFTGJg82Xz84AOpqCioQwEAAAAiEoFTGDjlFKlvX6myUnrllWCPBgAAAIg8BE5h4vrrzceFCyUvmhECAAAACCACpzAxdqwUHy/9+KPUSDuxAwAAACGLwClMtGwpjRlj/nrhwuCOBQAAAIg0BE5hxJaut2SJVFYW3LEAAAAAkSQ62AOA9/r1k7p0kbZtkx54QDrrLCk5WcrIMNuWAwAAAKgfBE5hpEkT6aKLzMDpwQerjqemSk88IWVmBm9sAAAAQGNGql4YWbZMWrrU+XhRkVn/tGxZw48JAAAAiAQETmHCapVmzHDditx2LCfHPA8AAABAYBE4hYm1a6WdO92/bhjSjh3meQAAAAACi8ApTBQXB/Y8AAAAAN4jcAoTycmBPQ8AAACA9wicwkRGhtk9z2Jxf05amnkeAAAAgMAicAoTUVFmy3HJffA0Ywb7OQEAAAD1gcApjGRmSnl5UkqK4/G4OPPxqaekPXukggJp8WLzkS57AAAAQN2xAW6YycyURowwu+cVF5s1TWeeKV1yibR1q3TSSVJ5edX5bI4LAAAA1B0rTmEoKkoaMEAaP958TEqSpk0zX6seNElsjgsAAAAEAoFTI2C1Sn//u+vX2BwXAAAAqDsCp0aAzXEBAACA+kXg1AiwOS4AAABQvwicGgE2xwUAAADqF4FTI8DmuAAAAED9InBqBLzZHPeee9gcFwAAAPAXgVMj4W5z3JgY8/HVV6UTJ9gcFwAAAPAHG+A2Iq42x+3YUbrwQmnNGnO/p9LSqvPZHBcAAADwDitOjUzNzXG7d5duuMF8rXrQJLE5LgAAAOAtAqdGzmp1HxixOS4AAADgHQKnRo7NcQEAAIC6I3Bq5NgcFwAAAKg7AqdGjs1xAQAAgLqjq14jZ9sct6ioqqapptRUqXdvs0W5rRtfRgb7PgEAAAA2BE6NnG1z3DFjzM1xXQVPMTHSySebwZUNrcoBAACAKqTqRQB3m+MmJUnR0dK2bY5Bk0SrcgAAAKA6AqcIkZkpFRZKq1ZJubnm486dUps2rs+nVTkAAABQhVS9CGLbHNemoEDau9f9+dVblVe/DgAAAIg0BE4RzNsW5EVFNI4AAABAZCNwimDetiC/7TbHlSkaRwAAACDSUOMUwWytyi2W2s+rmc5H4wgAAABEGgKnCGZrVS55Dp6qo3EEAAAAIg2BU4SrrVV5bao3jgAAAAAaO2qcoMxMacQIMwiyNYAoKpKuvdbztd42mAAAAADCGYETJLluVe4NbxtMAAAAAOGMVD245E3jCFtrcgAAAKCxI3CCS940jvjtN7POqaBAWrzYfKRZBAAAABojUvXglq1xxIwZ0s6dVcc7dZIqK6Xdu6Vu3RyDJfZ4AgAAQGPEihNqlZkpFRZKq1ZJubnm4/bt0pw55us1V5jY4wkAAACNEStO8Khm4wirVbrvPtfnGoaZ2peTY3bqi4pqiBECAAAA9YsVJ/hs7VrH1L2aqu/xZLVSAwUAAIDwx4oTfObt3k0rVkjXXecYZFEDBQAAgHDEihN85u3eTY8/7rwyRQ0UAAAAwlFQA6e5c+fqwgsvVMuWLdW+fXuNHDlSW7Zs8fr6JUuWyGKxaOTIkfU3SDjxZo8ndwzDfMzJkU6cII0PAAAA4SGogdPq1as1bdo0bdy4Ufn5+aqoqNCQIUN09OhRj9cWFhbqj3/8ozLYgbXB1bbH
kzfBlK0GKjVVGjhQys42H9PTWYkCAABAaApq4PT+++9r8uTJOvPMM9WjRw8tWrRI27dv1xdffFHrdVarVRMmTNCcOXN08sknN9BoUZ1tj6eUFMfjqanmapI39u51fE4aHwAAAEJVSDWHOHTokCQpMTGx1vP++te/qn379rrxxhu1du3aWs8tLy9XeXm5/XlpaakkqaKiQhUVFXUccd3ZxhAKY/HV8OHSFVdI69ZZVFxs1j717Wto3TqLHn/c96lltjI3NGOGdMUVv9HKvBbhPG8QPMwb+Iu5A38wb+CPhp43vryPxTBsVSfBVVlZqauvvlolJSVat26d2/PWrVuncePGafPmzWrXrp0mT56skpISvfXWWy7Pnz17tubYdmutJjc3V/Hx8YEaPqqxWqUpU4Zo//44SX4UQkmaM2edmjSRDh6MU5s2x3XGGfsJpAAAABBQZWVlys7O1qFDh5SQkFDruSETON1888167733tG7dOqWmpro85/DhwzrnnHP0zDPPaNiwYZLkMXByteKUlpamffv2efxwGkJFRYXy8/M1ePBgxcTEBHs4AbN8uUXjxpmRjmFUD54MeRNMJSYaOnCg6ryUFEOPPWbVqFEhMV2DrrHOG9Qv5g38xdyBP5g38EdDz5vS0lK1a9fOq8ApJFL1pk+frpUrV2rNmjVugyZJ+vnnn1VYWKjhw4fbj1VWVkqSoqOjtWXLFnXt2tXhmtjYWMXGxjrdKyYmJqR+iENtPHWVlSVFR0szZji2JE9KsjjVNrlSPWiSpF27LBo3Llp5edKIEebmurb0wIwMRexqVGObN2gYzBv4i7kDfzBv4I+Gmje+vEdQAyfDMHTLLbdo+fLlKigoUJcuXWo9v3v37vr6668djv3lL3/R4cOH9cQTTygtLa0+hwsfZWY6Bzm9e0tdu5qNIHxZ6zTrn6QpU5yDMTbVBQAAQH0LauA0bdo05ebmasWKFWrZsqV2794tSWrVqpWaNWsmSZo4caJSUlI0d+5cxcXF6ayzznK4R+vWrSXJ6ThCQ1SUNGCA47EnnjC751ksvgdP+/c7H7d148vLI3gCAABA/QhqO/L58+fr0KFDGjBggJKTk+1fr7/+uv2c7du3q7i4OIijRKC5a2XuoZmiW2yqCwAAgPoW9FQ9TwoKCmp9fdGiRYEZDBqUqzQ+q1UaNMi/+1XfVLd6DRVpfAAAAAiEkGgOgchUM43PajUDHV/rn6pzt6kuaXwAAACoi6Cm6gHVRUWZq0OSWf8UCNXT+EjbAwAAgL8InBBS3NU/paZKbdv6F1DZ0vjWrg3MGAEAABB5SNVDyHFV/5SRIa1Y4V83PpuiIrNhBPs/AQAAwFcETghJrtqY21ajnDfVda5tcuW222gcAQAAAP+QqoewkpkpFRZKq1ZJubnm486dZhDkKY3PXeOIZcvqbbgAAABoJAicEHZsq1Hjx5uPTZv611SCxhEAAADwFoETGgV3TSWSkmq/ztY4oqCAjXMBAADgHjVOaDRcNZUoKpKuvdbztVlZ0oEDVc+pfwIAAEB1BE5oVGo2lSgo8O666kGTxMa5AAAAcESqHhq1jAzvGkfUVL3+6cQJ0vgAAAAiHStOaNSiosyUO3/2f7LVP6Wmum9jbrU67zfF3lAAAACNDytOaPTcNY5ITPTuendtzO+4Q0pPlwYOlLKzzcf0dNqbAwAANEasOCEiuGocYbVKgwb5fi/bqtUjjzi/VrM2ihUpAACAxoHACRGjZuMIq9VMuysq8i2FrzaGYaYE5uRIlZXSbbeZG/Ta0K0PAAAgPJGqh4hlq3+SfG8eURtbbdTYsY5Bk1S1IkU6HwAAQHghcEJE83fjXH9V79ZHdz4AAIDwQeCEiJeZKRUWSqtWSbm55uPOnf61MfeGbUVq7drA3xsAAAD1gxonQM71T5L/bcy9VVRk7gtF4wgAAIDQx4oT4Ia7NL60NOn2282AquaKlC8rVLfdRitzAACAcEHgBNTCVRrftm3SvHmug6rUVGnpUu/S/NztD0XwBAAAEHpI1QM8cJXGJ7neG8qWbhcV5XuaX/VW5lddJa1fTxofAABAqCBwAuqgtqAqL0+aMcOxJXlSkvNKU3W2xhGpqY7nsf8TAABAcJGqB9QTV2l+f/+7d9eSxgcAABBaWHEC6lHNFamCAv/uUzONb80ai9asSVHz5hYNHEgaHwAAQH0jcAIaUEaGmXZXVOR7i3PHNL5oST312GOOaXxWq+uaKwAAANQNqXpAA4qKMoMcyf/Ndd2l8d1xh9nSnBbnAAAAgUfgBDQwd/tDJSX5dz/DML8eecSxEYVEbRQAAECgEDgBQeCqccTOnd7t/+QLWzpgTo6ZxgcAAAD/UOMEBImrVuZPPOH7/k+e2Gqj1q513TodAAAAnrHiBISQQKfxVVdcXPd7AAAARCpWnIAQk5kpjRjh2B2vd2+pa1f/uvHZtG9vtkOn4x4AAIDvCJyAEBToNL64OGnSJDPwsqnexhwAAAC1I1UPCBPu0vjS0qTbbzcDKneNJY4fdwyaJMeOe1aruRq1eLH5WL2RRG2vAQAARApWnIAwYkvjW7XqN7333mYNG3auBg6MVlSUdMkl0owZji3JU1OlkhLpyBHnexmGGWhNmeL6Ott+U+5eY6UKAABEEgInIMxERUn9+xs6erRI/fv3sNcpuaqNslqlQYPc38swpP37nY8XFUmjR7u+xrZSlZdH8AQAACIHgRPQiNSsjVq82L/71FZDZVupyskxAzUaTAAAgEhAjRPQiCUn1899q+8NRQ0UAACIBAROQCOWkWHWJLlrGlFXK1ZI6enSwIFSdrb5mJ5uNpwAAABoTAicgEYsKqqqyUN9BE+PP+7YOEJy7NYHAADQWBA4AY2cuzbmqalS27aBD6hs9VE5OdKJE6TxAQCAxoHmEEAEcNVxLyPDTLVztalu9ee1veaOrQYqNVXau7fqOK3MAQBAuGLFCYgQto5748ebj1FRta9Gvfmm+eXqtZwc796zetAkkcYHAADCFytOQIRztxplazPu6rW1a836Jl/RyhwAAIQrAicATvs/eXrN1q2vqMhz2l5N1VuZu3tPAACAUEPgBMBntm59ruqjvFVUZDaMcLXKZbW6XwEDAAAIBgInAH6x1UfNmOHYkjwpybm2yZXbbnPdOEJyvidNJQAAQLAROAHwm6v6qN69pa5dPafxuWocMXq063NtTSXy8gieAABAcNBVD0Cd1OzW17Spf5vu1hZkVd8bir2gAABAMBA4AQg4d23Ok5L8v2f1phIAAAANjVQ9APXCVRpfUZF07bV1u29xcWDGBwAA4AsCJwD1pmYr84KCut+zfXu68QEAgIZH4ASgwdRl/ydJio6WrrvOcdWJbnwAAKAhUOMEoMHY9n+SnBtHVH/urqnEb785p+rZuvGNHu0YNNleGzNGWrasbuMGAAAIauA0d+5cXXjhhWrZsqXat2+vkSNHasuWLbVe8/zzzysjI0Nt2rRRmzZtNGjQIH322WcNNGIAdeWucURqqvTmm+aXq9dat3Z9P7rxAQCAhhDUwGn16tW
aNm2aNm7cqPz8fFVUVGjIkCE6evSo22sKCgo0fvx4rVq1Shs2bFBaWpqGDBmioqKiBhw5gLrIzJQKC6VVq6TcXPNx2zbzuKvXFi2SSkr8ey+68QEAgEAIao3T+++/7/B80aJFat++vb744gv169fP5TWvvfaaw/N//vOfevPNN/Xxxx9r4sSJ9TZWAIFVs3FEba8tXlz39ysqct9UAgAAwJOQag5x6NAhSVJiYqLX15SVlamiosLtNeXl5SovL7c/Ly0tlSRVVFSooqKiDqMNDNsYQmEsCB+RNm+Skiyq6x9XOTmG9u2rKp5KSTH02GNWjRrlR5eKMBVp8waBw9yBP5g38EdDzxtf3sdiGP70tgq8yspKXX311SopKdG6deu8vm7q1Kn64IMP9O233youLs7p9dmzZ2vOnDlOx3NzcxUfH1+nMQNoGFarNGXKEO3fHyfJVecI2x9jbrpKuHzdPPanP32uiy4q1nfftdXBg3Fq0+a4zjhjP6tRAABEgLKyMmVnZ+vQoUNKSEio9dyQCZxuvvlmvffee1q3bp1SU1O9uuahhx7SvHnzVFBQoHPOOcflOa5WnNLS0rRv3z6PH05DqKioUH5+vgYPHqyYmJhgDwdhIhLnzfLlFo0bZ0YzhlEVAFkshr0JhMXi+JqngMpiMZSYKMXFSUVFjX81KhLnDQKDuQN/MG/gj4aeN6WlpWrXrp1XgVNIpOpNnz5dK1eu1Jo1a7wOmh599FE99NBD+uijj9wGTZIUGxur2NhYp+MxMTEh9UMcauNBeIikeZOVZe7j5LxXk0WPP27+uuZrSUkW7d3r/p6GYdH+/c7Hd+2yaNy4aOXlNc49oCJp3iCwmDvwB/MG/mioeePLewQ1cDIMQ7fccouWL1+ugoICdenSxavr5s2bpwceeEAffPCBevbsWc+jBBAqMjOlESPMDnmumjzUfK2oSLr2Wt/fxzDM1aucHPOepO0BAICgBk7Tpk1Tbm6uVqxYoZYtW2r37t2SpFatWqlZs2aSpIkTJyolJUVz586VJD388MO69957lZubq/T0dPs1LVq0UIsWLYLzjQBoML504yso8P99bG3MCwrM+9KNDwCAyBbUfZzmz5+vQ4cOacCAAUpOTrZ/vf766/Zztm/fruLiYodrTpw4oTFjxjhc8+ijjwbjWwAQwjIyzM1zLe56RnghK0saOFDKzjYf09OlZcsCNkQAABAmgp6q50lBjf8yLiwsrJ/BAGh0oqKkJ56QxoyxNY7w/R4HDjg+Lyoy79dY658AAIBrQV1xAoD6lplpBjkpKY7HU1Oltm19X42yBV85OWabdAAAEBkInAA0epmZUmGhtGqVlJtrPhYWSgsWmK/7Ezzt2GE2orBazTqoxYvNR4IpAAAap5BoRw4A9c1VUwnbalTNNuaJic4peq6sWCFdd13N9uhmemBmphlEuesACAAAwguBE4CI5qrFudUqDRrk+Vrb/lHV2Wqg/vhHcxXKXVAFAADCC4ETgIhXczXKajWDnKIi3xtK2M5/5BHn12gsAQBA+KLGCQBqsHXjk5zrn+rS2pzGEgAAhC8CJwBwobZufDk5/t+3emMJAAAQPgicAMANV934tm0za6Lqqtq+3gAAIAxQ4wQAtXDVjS8jw/8aKJv27c325XTcAwAgPBA4AYCPbDVQY8aYNU++Bk9xcdKkSWbgZUPHPQAAQhupegDgB3c1UGlp0u23mwGVu0YSx487Bk1SVce9ZcvYVBcAgFDEihMA+MnVHlC2lLtLLnHeWDc1VSopkY4ccb6XYZiB1pQprq9jNQoAgOAicAKAOnBVAyX5t7GuYUj79zsfZ/8nAACCj8AJAOpJzaBq8WL/7mNbjcrJMYMxmkgAANDwCJwAoIEkJ/t/rW3/p4ICM3CiGx8AAA2LwAkAGkgg2phnZUkHDlQ9r17/ZLW6rrcCAAB1R1c9AGggtjbmkvuOe55UD5qkqvqnO+6Q0tOlgQOl7GzzMT3d7NIHAADqjsAJABqQuzbmqalS27a+B1SGYX498ohjJz7JscU5AACoGwInAGhgmZlSYaG0apWUm2s+FhZKCxaYr/u7GlWTLR0wJ0c6cUJavdqiNWtStHq1hb2hAADwETVOABAErtqY21ajau7jlJjonKLnLVtTidRUae/eaEk99dhj7A0FAICvWHECgBDiajVq6dK633fvXsfnpPEBAOCbOq04HT9+XHFxcYEaCwBAzqtRVmvdu/HVxN5QAAD4xucVp8rKSt13331KSUlRixYttHXrVknSPffcoxdeeCHgAwSASBeIbnyuVN8bqqDA3KC3oEDUPwEA4ILPgdP999+vRYsWad68eWratKn9+FlnnaV//vOfAR0cAMDkrhtfWpp0++1mQOVvUJWVRRtzAAA88Tlwevnll7VgwQJNmDBBUdVyO3r06KEffvghoIMDAFRxVf+0bZs0b57roCopybv7utsbiuAJAIAqPtc4FRUVqVu3bk7HKysrVVFREZBBAQBcc9WNTzKDqhEjpLVrpeJiKTlZ6t1b6trV99qomvVPkuN9MzKoiQIARB6fA6czzjhDa9euVefOnR2O5+Xl6bzzzgvYwAAAvnEVVD3xhLl6ZLH4Hjzt2CE98ID0/POO7dFpZQ4AiEQ+B0733nuvJk2apKKiIlVWVmrZsmXasmWLXn75Za1cubI+xggA8FNd94aaNcv5mC2VLy+P4AkAEDl8rnEaMWKE3nnnHX300Udq3ry57r33Xn3//fd65513NHjw4PoYIwCgDmy1Ufn5v2nmzE3Kz/+tTntD2VaucnKkEyfoyAcAiAx+7eOUkZGh/Pz8QI8FAFBPoqKk/v0NHT1apP79e6hJk7rtDWVL5UtNddxclzQ+AEBj5fOKEwAg/NW2N5Qvbc2rB00SHfkAAI2Xz4FTkyZNFBUV5fYLABAe3O0NlZoqzZnj3z1J4wMANFY+p+otX77c4XlFRYW++uorvfTSS5rj79+0AICgcNXGPCPDfO355/1L5SONDwDQGPkcOI2wbepRzZgxY3TmmWfq9ddf14033hiQgQEAGoa7vaH8bWVu4y6Nj258AIBwFLAap0suuUQff/xxoG4HAAgyd6l8SUn+3a96Gh9pewCAcONXV72ajh07pieffFIpNf92BQCENVepfL17S1271i2Nr6DAXOmqnh5ImSwAIJT5HDi1adNGlmotlwzD0OHDhxUfH69XX301oIMDAASfq1S+uqbxZWU5bsBbvf7JanWuuSKoAgAEm8+B09///neHwKlJkyZKSkrSxRdfrDZt2gR0cACA0GRL45sxQ9q5s+p4UpJzbZMr1YMmqar+6Y9/NLvwVb8nTSUAAKHA58Bp8uTJ9TAMAEC4CWQan+3cRx5xfo2mEgCAUOBV4PTf//7X6xuec845fg8GABBe6iONrybDMO+VkyNddZW0fj1pfACAhudV4HTuuefKYrHI8PA3oMVikZVWSQAQ0dyl8SUmOqfoeYu9oQAAweZV4LRt27b6HgcAoBFxlcZntUqDBtXtvuwNBQAIFq8Cp86dO9f3OAAAjUzNND
6r1Vwh8qeNuTvV0/hGjCBtDwBQf/zex+m7777T9u3bdeLECYfjV199dZ0HBQBofKKiAl//JFWl8a1d61xvBQBAoPgcOG3dulWjRo3S119/7VD3ZGtRTo0TAMAdd/VPaWnSuHHSo4+az/0JqoqKzI11aRwBAKgPTXy9YMaMGerSpYv27Nmj+Ph4ffvtt1qzZo169uypgoKCehgiAKAxycyUCgulVauk3Fzzcds2ad48M6hKSXE8PynJu/vedps0cKCUnW0+pqdLy5YFevQAgEjl84rThg0b9Mknn6hdu3Zq0qSJmjRpor59+2ru3Lm69dZb9dVXX9XHOAEAjYirNuZS3faGonEEAKA++bziZLVa1bJlS0lSu3bttGvXLklmA4ktW7YEdnQAgIhjC6rGjzcfmzY1a6MkszbKW7YgKydHOnHCTONbvNh8JKscAOArn1eczjrrLP3nP/9Rly5ddPHFF2vevHlq2rSpFixYoJNPPrk+xggAiHDuaqOSkpxXmqpj/ycAQKD4HDj95S9/0dGjRyVJf/3rX3XVVVcpIyNDbdu21euvvx7wAQIAILlO4ysqkq691vO1pPEBAOrK68CpZ8+e+t3vfqfs7GwlJCRIkrp166YffvhBBw4cUJs2beyd9QAAqA81a6P87UnE/k8AAF95XePUo0cP3XHHHUpOTtbEiRMdOuglJiYSNAEAGlxGhpl2589fQbY0voIC6p8AAJ55HTi98MIL2r17t55++mlt375dl112mbp166YHH3xQRUVF9TlGAABcsm2qK/kXPElSVhZtzAEAnvnUVS8+Pl6TJ09WQUGBfvzxR40bN07PPfec0tPTdeWVV2qZj3/TzJ07VxdeeKFatmyp9u3ba+TIkV515nvjjTfUvXt3xcXF6eyzz9a7777r0/sCABoPW+MIf/d/OnDA8bmt/ongCQBQnc/tyG26du2q+++/X4WFhVq8eLE2btyosWPH+nSP1atXa9q0adq4caPy8/NVUVGhIUOG2JtPuLJ+/XqNHz9eN954o7766iuNHDlSI0eO1DfffOPvtwIACHOuNtXdudO/NL7qbcytVvOLVD4AgM9d9aorKCjQwoUL9eabbyo6Olr/93//59P177//vsPzRYsWqX379vriiy/Ur18/l9c88cQTuvzyy3X77bdLku677z7l5+frqaee0rPPPuvfNwIACHuuNtV94glz9chiqX3z3Jps9U8PPCA9/7xjC3RamQNAZPI5cNq5c6cWLVqkRYsWaevWrcrIyNAzzzyjsWPHqlmzZnUazKFDhySZzSbc2bBhg2bOnOlwbOjQoXrrrbdcnl9eXq7y8nL789LSUklSRUWFKioq6jTeQLCNIRTGgvDBvIE/InHeDB8uLVli0cyZUSoqqlp6Skw0dOCA56WoWbNs0VbVuUVFhsaMkZYssWrUKB+isTAWiXMHdce8gT8aet748j4Ww/Du/+CWLl2qF198UR9//LHat2+vSZMm6YYbblC3bt38Hmh1lZWVuvrqq1VSUqJ169a5Pa9p06Z66aWXNH78ePuxZ555RnPmzNGvv/7qdP7s2bM1Z84cp+O5ubmKj48PyNgBAKHNapW++66tDh6MU5s2x1VZKc2a1deLKw1VD5qqH2/X7pieey6fVuYAEMbKysqUnZ2tQ4cO2bdccsfrFadrr71WV155pZYvX64rrrhCTZr4XR7l0rRp0/TNN9/UGjT546677nJYoSotLVVaWpqGDBni8cNpCBUVFcrPz9fgwYMVExMT7OEgTDBv4I9InzfDh1f92mqVFiwwtGuXZBiuAyMzYHK3KmXRvn3xat78SkVFVW3I27ev0SgDqUifO/AP8wb+aOh5Y8tG84bXgdPOnTvVvn17vwbkyfTp07Vy5UqtWbNGqamptZ7bsWNHp5WlX3/9VR07dnR5fmxsrGJjY52Ox8TEhNQPcaiNB+GBeQN/MG+kmBjpySdd1z+Zz73rKJGdHe3Qla+x1z8xd+AP5g380VDzxpf38HrZqD6CJsMwNH36dC1fvlyffPKJunTp4vGaXr166eOPP3Y4lp+fr169egV8fACAxstdG/PUVMlFhrdLtDIHgMhRp656dTVt2jTl5uZqxYoVatmypXbv3i1JatWqlb3RxMSJE5WSkqK5c+dKkmbMmKH+/fvrb3/7m6688kotWbJEmzZt0oIFC4L2fQAAwlNmpjRihLR2bVW6XUaG+drzz5uBkK/d+CwWs5X5VVdJ69c73rcxpvEBQKQIauA0f/58SdKAGv1jFy5cqMmTJ0uStm/f7lBP1bt3b+Xm5uovf/mL/vznP+uUU07RW2+9pbPOOquhhg0AaERctTGX6t7KPDVV2ru36nhjT+MDgMYuqIGTNw39CgoKnI6NHTvW5812AQDwhS2Vb8YMx32cEhOdU/RcqR40SVVpfHl5BE8AEI58bo23Y8cO7az2N8hnn32mnJwcUuUAAI1OZqZUWCitWiXl5pqPS5f6dy/b/xXm5Jhd/QAA4cXnFafs7GxNmTJF1113nXbv3q3BgwfrzDPP1Guvvabdu3fr3nvvrY9xAgAQFDVT+axWM+3O1/onqSqNr6BADm3MqX8CgNDn84rTN998o4suukiSuSnuWWedpfXr1+u1117TokWLAj0+AABCSlSUWaskmfVP/sjKkgYOlLKzzcf0dDrxAUCo8zlwqqiosO+L9NFHH+nqq6+WJHXv3l3FxcWBHR0AACHIXSvzpCTvrqeNOQCEH58DpzPPPFPPPvus1q5dq/z8fF1++eWSpF27dqlt27YBHyAAAKHIVf3Tzp1mGp+vK1HUPwFA6PO5xunhhx/WqFGj9Mgjj2jSpEnq0aOHJOntt9+2p/ABABAJXLUyr2sbc+qfACA0+Rw4DRgwQPv27VNpaanatGljPz5lyhTFx8cHdHAAAISburYxz8pyPI/9nwAgNPicqnfs2DGVl5fbg6ZffvlFjz/+uLZs2aL27dsHfIAAAISburQxp/4JAEKTzytOI0aMUGZmpm666SaVlJTo4osvVkxMjPbt26fHHntMN998c32MEwCAsBKoNuaGYab95eRIV10lrV9PGh8ABIPPK05ffvmlMjIyJEl5eXnq0KGDfvnlF7388st68sknAz5AAAAag7q0MbfVP6Wm0sYcAILF58CprKxMLVu2lCR9+OGHyszMVJMmTXTJJZfol19+CfgAAQBoLNy1MU9M9O76vXsdn5PGBwANx+fAqVu3bnrrrbe0Y8cOffDBBxoyZIgkac+ePUpISAj4AAEAaEzqUv9UE23MAaDh+FzjdO+99yo7O1u33XabLr30UvXq1UuSufp03nnnBXyAAAA0NoGqf5JoYw4ADcXnwGnMmDHq27eviouL7Xs4SdJll12mUaNGBXRwAABEAlv9kz/7P9nQxhwA6pfPqXqS1LFjR5133nnatWuXdv7/TSouuugide/ePaCDAwAgUrirf0pK8u562pgDQP3yOXCqrKzUX//6V7Vq1UqdO3dW586d1bp1a913332qrKysjzECABARXNU/7dxprh7504lPMuufT
pwwU/kWLzYfqYcCAN/5nKp3991364UXXtBDDz2kPn36SJLWrVun2bNn6/jx43rggQcCPkgAACJFzfonyf80vuptzKt35CONDwB85/OK00svvaR//vOfuvnmm3XOOefonHPO0dSpU/X8889r0aJF9TBEAAAiG23MASD4fA6cDhw44LKWqXv37jpQM8EaAAAEBG3MASC4fE7V69Gjh5566ik9+eSTDsefeuophy57AAAgsOqjjfnatWbr8rVraWUOALXxOXCaN2+errzySn300Uf2PZw2bNigHTt26N133w34AAEAgGuBaGO+YoV03XVmEwobaqAAwJnPqXr9+/fXjz/+qFGjRqmkpEQlJSXKzMzUli1blJGRUR9jBAAAbtS1jfnjjzsGTRI1UADgis8rTpLUqVMnp+55O3fu1JQpU7RgwYKADAwAAHgnM1MaMcIx3a53b6lrV//T+CwWswZqxIh6GTIAhB2/NsB1Zf/+/XrhhRcCdTsAAOADW/3T+PHmY9OmZrqd5LwHlDd7QtlqoAoKpNWrLVqzJkWrV1toJgEgYgUscAIAAKHFXRpfaqq5muSNrCxp8OBoPfZYTw0eHK30dFL4AEQmAicAABoxV23Mt23zPgWv5k4j1D8BiFR+1TgBAIDwUbONuWS2HPenlXnN+ifalgOIFF4HTpkeepKWlJTUdSwAAKCB1KWVefX6p6go9n8CEBm8DpxatWrl8fWJEyfWeUAAAKBh2GqgZsxwbEmemOicoudKVpbjeez/BKAx8zpwWrhwYX2OAwAABIGrVuZWqzRokOdr3dU/5eURPAFofKhxAgAgwtWsgbJaqX8CgJoInAAAgAPqnwDAGe3IAQCAE3d7QCUmend9VpY0cKCUnW0+sv8TgHBH4AQAAFyy7QGVn/+bZs7cpPz837R0qXfXsv8TgMaGVD0AAOBWVJTUv7+ho0eL1L9/DzVpUvf6p6uuktavJ40PQHghcAIAAF4LRP1Taqq0d2/VcdqYAwgHpOoBAACf1LX+qXrQJJHGByA8sOIEAAB8Vpf9n2oijQ9AOCBwAgAAfgnU/k8SaXwAQh+pegAAICBs9U+SuYLkD9L4AIQqAicAABAw7uqfkpL8u59t5Sonx1zRAoBgIXACAAABZdv/adUqKTfXfNy500y782clypbGt3ZtwIcKAF6jxgkAAARczfonyf825jbFxQEZGgD4hRUnAADQIOqaxte+vVRQIC1ebD6SugegIbHiBAAAGoyrNua9e0tdu9beja9FC2nyZDPlz4aOewAaEoETAABoUP6k8R05Yn5VZ+u4l5dH8ASg/pGqBwAAgs5dGl9KihQX5/qa6h33TpwgjQ9A/WLFCQAAhARXaXxWqzRokPtr2DgXQEMhcAIAACGjZhrf4sXeXedu41zS+AAECql6AAAgZCUn+3cdG+cCCDRWnAAAQMjKyDDT7mrruOeOLY2voMBcybKl/2VkmM8BwBcETgAAIGRFRdV949ysLOnAgarn1D8B8AepegAAIKTVdePc6kGTVFX/tGxZYMYHIDIQOAEAgJCXmSkVFkqrVkm5uebjzp3m6pHF4tu9atY/Wa20MgfgWVADpzVr1mj48OHq1KmTLBaL3nrrLY/XvPbaa+rRo4fi4+OVnJysG264Qfv376//wQIAgKCyddwbP958bNrUTLmT/AueduyQHnhASk+XBg6UsrPNx/R0VqMAOAtq4HT06FH16NFDTz/9tFfnf/rpp5o4caJuvPFGffvtt3rjjTf02Wef6f/+7//qeaQAACAUuUvjS0z07vpZs8yVq+pI5QPgSlCbQwwbNkzDhg3z+vwNGzYoPT1dt956qySpS5cu+v3vf6+HH364voYIAABCnD8b59bGMMwVrJwc87504AMghVlXvV69eunPf/6z3n33XQ0bNkx79uxRXl6errjiCrfXlJeXq7y83P68tLRUklRRUaGKiop6H7MntjGEwlgQPpg38AfzBv4Kl7nTp0/Vr61WKSUlWrt2SYbhKo/PkOQ+v8+Wyrdq1W/q39+PVn4Im3mD0NLQ88aX97EYhj+NPQPPYrFo+fLlGjlyZK3nvfHGG7rhhht0/Phx/fbbbxo+fLjefPNNxcTEuDx/9uzZmjNnjtPx3NxcxcfHB2LoAAAgBG3YkKyHH77w/z+rHiQZLo65lpOzSW3bHtfBg3Fq0+a4zjhjPytQQCNSVlam7OxsHTp0SAkJCbWeG1aB03fffadBgwbptttu09ChQ1VcXKzbb79dF154oV544QWX17hacUpLS9O+ffs8fjgNoaKiQvn5+Ro8eLDb4A+oiXkDfzBv4K9wnjvLl1s0c2aUioqqgqTUVEM33FCpv/7VcwTUrp2hffuqrk1JMfTYY1aNGhUS/3wKaeE8bxA8DT1vSktL1a5dO68Cp7BK1Zs7d6769Omj22+/XZJ0zjnnqHnz5srIyND999+v5ORkp2tiY2MVGxvrdDwmJiakfohDbTwID8wb+IN5A3+F49zJypJGj3asf8rIsEiK0osvmo0gavsv5OpBkyTt2mXRuHHRystzrqvKyKAeypVwnDcIvoaaN768R1gFTmVlZYqOdhxy1P//EypEFs4AAECIsbUxr+mJJ8zueRZL7cFTdbbGEVOmSDNmOHbkS00175mZGZBhAwgxQW1HfuTIEW3evFmbN2+WJG3btk2bN2/W9u3bJUl33XWXJk6caD9/+PDhWrZsmebPn6+tW7fq008/1a233qqLLrpInTp1Csa3AAAAwpS7VuZJSbVfZxjS/v20MQciTVBXnDZt2qSBAwfan8+cOVOSNGnSJC1atEjFxcX2IEqSJk+erMOHD+upp57SH/7wB7Vu3VqXXnop7cgBAIBfXLUyLyqSrr3W93vRxhxo3IIaOA0YMKDWFLtFixY5Hbvlllt0yy231OOoAABAJKmZyldQ4P+9bG3M1651nR4IIHwFNVUPAAAg1GRkmPVKFs/dyt0qLg7ceACEBgInAACAaqKizCYPkv/BU/v25srV4sXmo9UaqNEBCBYCJwAAgBrcNY5ITZXatq09oGrWTJo0SRo4UMrONh/T02kaAYQ7AicAAAAXMjOlwkJp1SopN9d8LCyUFiwwX3cXPB07ZjaYqI6Oe0D4C6t9nAAAABqSqz2gbKtRrvZxKimRjhxxvk/1jntXXSWtX8/GuUC4IXACAADwkas25larNGiQ+2tsHfdSU6W9e6uOs3EuEB4InAAAAPxQczVq8WLvrqseNElVaXx5eQRPQCijxgkAACAAkpP9u862pWVODt33gFBG4AQAABAAddn/qfrGuQBCE4ETAABAAARi/yc2zgVCF4ETAABAgLjb/ykpybvr2TgXCF00hwAAAAggVx33eveWunY1G0HYappqat5cmjzZucU5HfeA0EDgBAAAEGCu9n964gmze57F4jp4OnrU/KqOjntA6CBVDwAAoAG4S+NLTZXi411fU73j3okTpPEBwcSKEwAAQANh41wgfBE4AQAANCA2zgXCE6l6AAAAQcTGuUB4IHACAAAIIjbOBcIDgRMAAEAQBWLj
3KIiGkcA9Y3ACQAAIMjqunHubbdJAwdK2dnmY3q6tGxZwIcJRDQCJwAAgBCQmSkVFkqrVkm5uebjzp3epfG5axxB8AQEDoETAABAiLB13Bs/3nxs2tS/ND4aRwCBR+AEAAAQwvxN47M1jigooP4JCAT2cQIAAAhxrjbOLSqSrr3W87VZWdKBA1XP2TgX8A+BEwAAQBiouXFuQYF311UPmiQ2zgX8RaoeAABAGPJ3/yfqnwD/EDgBAACEobrs/8TGuYDvCJwAAADClLvGEYmJ3l1fXBz4MQGNFTVOAAAAYcxV4wirVRo0yPO1tnOrX5uRYa5mAXBE4AQAABDmajaOsFrN+qeioqqaJleee0667jpzo10buu4BrpGqBwAA0MjUVv9U/fmSJY5Bk1TVdW/ZsvodIxBuCJwAAAAaIXf1T6mp0tKlUps2rq+r3nXvxAk2zwVsSNUDAABopFzVP2VkmM8PHnR/na3rXmqqtHdv1XHS+BDJCJwAAAAasZr1T5L33fSqB00Sm+cispGqBwAAEGGSk/27js1zEclYcQIAAIgwGRnedd1zxZbGV1BgrmbRxhyRgsAJAAAgwti67o0ZY3bZ8zV4kqSsLOnAgarn1D+hsSNVDwAAIAK567qXlOTd9dWDJok25mj8CJwAAAAiVGamVFgorVol5eaajzt3mqtHNfd/8oT6JzR2pOoBAABEMFdd9/xN46te/2QYFq1Zk6LmzS0aOJD6J4Q/VpwAAADgwF0aX2Kid9dnZUmDB0frscd6avDgaKWnk8KH8EfgBAAAACeu0viWLvXuWuqf0BiRqgcAAACXaqbxWa3+tTE3DDPtLydHGjGCtD2EJ1acAAAA4BVbG3PJv+YRO3ZIa9cGflxAQyBwAgAAgNfqWv9UVGQ2j1i82HykAx/CBal6AAAA8Elmpplyt3atVFwsJSebAdCgQZ6vve02ae/equdsnItwQeAEAAAAn/lb/1Q9aJKqGkfk5RE8IbSRqgcAAIA687f+iY1zES4InAAAABAQ7uqfkpJqv47GEQgHBE4AAAAIGNv+T/n5v2nmzE3Kz/9Nf/+7d9cWF9fr0IA6ocYJAAAAARUVJfXvb+jo0SL1799Dn37q3XXt25ud9mwNJzIy2PMJoYPACQAAAPUqI8Nz44joaOm66xxXnei4h1BCqh4AAADqlTeNI377zTlVz9Zxb9kys3EE+z8hmAicAAAAUO/cNY5ITZVat3Z9jW11asoUKT1dGjhQys42H9PTzYAKaChBDZzWrFmj4cOHq1OnTrJYLHrrrbc8XlNeXq67775bnTt3VmxsrNLT0/Xiiy/W/2ABAABQJ7bGEatWSbm55uOiRVJJiftrDEPav1/audPxePXVKKAhBLXG6ejRo+rRo4duuOEGZXqZvJqVlaVff/1VL7zwgrp166bi4mJVVlbW80gBAAAQCDU3zl282L/7GIaZ9peTI40YQRMJ1L+gBk7Dhg3TsGHDvD7//fff1+rVq7V161YlJiZKktLT0+tpdAAAAKhvycn+X1t9/6fqwRhQH8Kqq97bb7+tnj17at68eXrllVfUvHlzXX311brvvvvUrFkzl9eUl5ervLzc/ry0tFSSVFFRoYqKigYZd21sYwiFsSB8MG/gD+YN/MXcgT+8nTeXXCKlpERr1y7JMNx0jvBgx47fVFHhpl0fwkpD/3njy/uEVeC0detWrVu3TnFxcVq+fLn27dunqVOnav/+/Vq4cKHLa+bOnas5c+Y4Hf/www8VHx9f30P2Wn5+frCHgDDEvIE/mDfwF3MH/vBm3lx7bbIefvhCSYak6sFTzeeubdu2UQ8/LB08GKc2bY7rjDP2k7oX5hrqz5uysjKvz7UYhrtu+g3LYrFo+fLlGjlypNtzhgwZorVr12r37t1q1aqVJGnZsmUaM2aMjh496nLVydWKU1pamvbt26eEhISAfx++qqioUH5+vgYPHqyYmJhgDwdhgnkDfzBv4C/mDvzh67xZvtyimTOjVFRUFSilpBg6dkw6eND9alTTpobatZN27XK87rHHrBo1KiT+mQsfNPSfN6WlpWrXrp0OHTrkMTYIqxWn5ORkpaSk2IMmSTr99NNlGIZ27typU045xema2NhYxcbGOh2PiYkJqT/8Q208CA/MG/iDeQN/MXfgD2/nTVaWNHq0Wa9UXGzWPmVkWLRihdk9z2JxvXnuiRMW7drleGzXLovGjYtWXh6b54arhvrzxpf3CKt9nPr06aNdu3bpyJEj9mM//vijmjRpotTU1CCODAAAAHVl67g3frz5GBVV+/5P7hYIbAFWTg4b5SJwgho4HTlyRJs3b9bmzZslSdu2bdPmzZu1fft2SdJdd92liRMn2s/Pzs5W27Ztdf311+u7777TmjVrdPvtt+uGG25w2xwCAAAA4c3d/k//v+eXS9U77lmtUkGB2fq8oIBgCv4Jaqrepk2bNHDgQPvzmTNnSpImTZqkRYsWqbi42B5ESVKLFi2Un5+vW265RT179lTbtm2VlZWl+++/v8HHDgAAgIbj7/5PK1ZI113nuIFuaqr0xBOk8cE3QQ2cBgwYoNp6UyxatMjpWPfu3enqAwAAEOG83f/p8cedjxUVmXVT1EDBF2FV4wQAAABIUkaGuXJk8WPrJ2qg4A8CJwAAAISdqCgz3U5yDp68Caaq10AB3iBwAgAAQFiqreNeTo539ygqonEEvBNW+zgBAAAA1WVmSiNG1Nz/yXzuqr6ppttuk/burXpO4wi4Q+AEAACAsFaz455UVQNVVOR641yb6kGTROMIuEeqHgAAABqd2mqgakPjCLhD4AQAAIBGyV0NVFJS7dfROAKukKoHAACARstVDVRRkXTttZ6vtTWOqF47FRVV70NGiCJwAgAAQKNWswaqoMC762gcgepI1QMAAEBE8XbzXHeNI5Ytq7+xIXQROAEAACCi0DgC/iBwAgAAQMShcQR8RY0TAAAAIhKNI+ALAicAAABELBpHwFuk6gEAAAD/H40j4A6BEwAAAPD/0TgC7hA4AQAAANXQOAKuUOMEAAAA1EDjCNRE4AQAAAC4QOMIVEeqHgAAAOAFGkdENgInAAAAwAs0johsBE4AAACAl2gcEbmocQIAAAB8QOOIyETgBAAAAPiIxhGRh1Q9AAAAoI5oHNH4ETgBAAAAdUTjiMaPwAkAAAAIgLo2jigoML8WLzYfCaRCCzVOAAAAQIDUpXFEVpZ04EDVc+qfQguBEwAAABBA/jaOqB40SVX1T3l5BE+hgFQ9AAAAoB552ziiJuqfQguBEwAAAFCP/G0cIbFxbighcAIAAADqmbvGEYmJ3l1fXBz4McE31DgBAAAADcBV4wirVRo0yPO17dubtVK26zIyzJUsNBwCJwAAAKCB1GwcYbWa9U9FRVU1Ta7QcS/4SNUDAAAAgqS2+qfqz9113Fu2rH7HhyoETgAAAEAQuat/SkmR2rZ1fQ0d9xoeqXoAAABAkPlT/2TruFdQYK5cUf9UvwicAAAAgBBQs/5p8WLvrqP+qWGQqgcAAACEoORk786j/qlhEDgBAAA
AISgjw1w98mfTXIn6p0AjcAIAAABCUG0d9zyx1T+tXRv4cUUqAicAAAAgRLnruJeY6N31xcXmqlNBgVkzVVDAKpS/aA4BAAAAhDB/Ou7Z/PSTlJ4u7dxZdYzmEf4hcAIAAABCXM2Oe1arGQAVFVXVNLkya5bzMVvziLw8gidfkKoHAAAAhJna6p881UPRPMI/BE4AAABAGHJX/5SaKs2ZU/u1NI/wHYETAAAAEKYyM6XCQmnVKik313zctk065RTvri8urtfhNSrUOAEAAABhrGb9k+T95rnengcCJwAAAKDRsW2eW1vziE6dpN69zRbltm59GRlmIAZnBE4AAABAI2NrHjFmjNkswlXwdPSo1LmztHt31TFalbtHjRMAAADQCLlrHpGcLCUkSIcOOQZNUlWr8mXLGm6c4YLACQAAAGikXDWPKCyUmjd3fT6tyt0jVQ8AAABoxGo2j7DVNLlja1VeUGBeS/2TicAJAAAAiCDetiDPypIOHKh6Hun1T0FN1VuzZo2GDx+uTp06yWKx6K233vL62k8//VTR0dE699xz6218AAAAQGPjbQvy6kGT5Fj/ZLWaK1KLF5uPkZDWF9TA6ejRo+rRo4eefvppn64rKSnRxIkTddlll9XTyAAAAIDGydaq3GLx7Tpb/dOUKVJ6ujRwoJSdbT6mpzf+hhJBTdUbNmyYhg0b5vN1N910k7KzsxUVFeXTKhUAAAAQ6bxpVe6OYUj79zsft61G5eU13lS+sKtxWrhwobZu3apXX31V999/v8fzy8vLVV5ebn9eWloqSaqoqFBFRUW9jdNbtjGEwlgQPpg38AfzBv5i7sAfzJvQNny4tGSJRTNnRqmoqGrpKTHR0IEDPi5FyQyoLBZDM2ZIV1zxm99NJBp63vjyPmEVOP3000+68847tXbtWkVHezf0uXPnas6cOU7HP/zwQ8XHxwd6iH7Lz88P9hAQhpg38AfzBv5i7sAfzJvQFRsrPfmk9N13bXXwYJzatDmuykpp1qy+ft3PMCzauVN69NF/6+yzXSxL+aCh5k1ZWZnX54ZN4GS1WpWdna05c+bo1FNP9fq6u+66SzNnzrQ/Ly0tVVpamoYMGaKEhIT6GKpPKioqlJ+fr8GDBysmJibYw0GYYN7AH8wb+Iu5A38wb8LH8OFVv7ZapQULDO3aZQZC/ujc+RJdcYUP+X/VNPS8sWWjeSNsAqfDhw9r06ZN+uqrrzR9+nRJUmVlpQzDUHR0tD788ENdeumlTtfFxsYqNjbW6XhMTExI/RCH2ngQHpg38AfzBv5i7sAfzJvwEhNjrkL5U/9kk5YWrbr+ljfUvPHlPcImcEpISNDXX3/tcOyZZ57RJ598ory8PHXp0iVIIwMAAAAaj8xMs8nDjBnSzp1Vx1NTpWPHzDbl7gKq1FSza19jFNTA6ciRI/rf//5nf75t2zZt3rxZiYmJOumkk3TXXXepqKhIL7/8spo0aaKzzjrL4fr27dsrLi7O6TgAAAAA/2VmSiNGSGvXmhvmJiebAdGKFbWvRp10kpnuV/M6f5tFhJKgBk6bNm3SwIED7c9ttUiTJk3SokWLVFxcrO3btwdreAAAAEDEioqSBgxwPOZuNSopyVyJWr9eSkyUjh6tei011Wx/Hu5tyoO6Ae6AAQNkGIbT16JFiyRJixYtUkFBgdvrZ8+erc2bNzfIWAEAAACYAVBhobRqlZSbaz4WF0s5Oebr1YMmqWqPp3DfIDdsapwAAAAAhIaaq1FWq/T6667PNfd4MgOrESPCN20vqCtOAAAAAMLf2rWOqXs1GYa0Y4d5XrhixQkAAABAnRQXe3deUZFUUBCejSMInAAAAADUSXKyd+fl5Ej79lU9D6fGEaTqAQAAAKiTjAwzCLJYaj+vetAkhVfjCAInAAAAAHUSFWWuHEmeg6fqbHtB5eSYDSZCGYETAAAAgDqz7fGUkuJ4PCmp9uvCpXEENU4AAAAAAiIz02w5vnZtVQOIoiLp2ms9X+ttg4lgIXACAAAAEDA193gqKPDuOm8bTAQLqXoAAAAA6o2nxhEWi5SWZp4XygicAAAAANSb2hpH2J4//njo7+dE4AQAAACgXrlrHJGaah4Ph32cqHECAAAAUO9cNY7IyAj9lSYbAicAAAAADaJm44hwQqoeAAAAAHhA4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAeEDgBAAAAgAcETgAAAADgAYETAAAAAHgQHewBNDTDMCRJpaWlQR6JqaKiQmVlZSotLVVMTEywh4MwwbyBP5g38BdzB/5g3sAfDT1vbDGBLUaoTcQFTocPH5YkpaWlBXkkAAAAAELB4cOH1apVq1rPsRjehFeNSGVlpXbt2qWWLVvKYrEEezgqLS1VWlqaduzYoYSEhGAPB2GCeQN/MG/gL+YO/MG8gT8aet4YhqHDhw+rU6dOatKk9iqmiFtxatKkiVJTU4M9DCcJCQn8oQKfMW/gD+YN/MXcgT+YN/BHQ84bTytNNjSHAAAAAAAPCJwAAAAAwAMCpyCLjY3VrFmzFBsbG+yhIIwwb+AP5g38xdyBP5g38Ecoz5uIaw4BAAAAAL5ixQkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMApiJ5++mmlp6crLi5OF198sT777LNgDwkhZO7cubrwwgvVsmVLtW/fXiNHjtSWLVsczjl+/LimTZumtm3bqkWLFho9erR+/fXXII0Yoeihhx6SxWJRTk6O/RjzBu4UFRXp2muvVdu2bdWsWTOdffbZ2rRpk/11wzB07733Kjk5Wc2aNdOgQYP0008/BXHECDar1ap77rlHXbp0UbNmzdS1a1fdd999qt57jHkDSVqzZo2GDx+uTp06yWKx6K233nJ43Zt5cuDAAU2YMEEJCQlq3bq1brzxRh05cqTBvgcCpyB5/fXXNXPmTM2aNUtffvmlevTooaFDh2rPnj3BHhpCxOrVqzVt2jRt3LhR+fn5qqio0JAhQ3T06FH7ObfddpveeecdvfHGG1q9erV27dqlzMzMII4aoeTzzz/Xc889p3POOcfhOPMGrhw8eFB9+vRRTEyM3nvvPX333Xf629/+pjZt2tjPmTdvnp588kk9++yz+ve//63mzZtr6NChOn78eBBHjmB6+OGHNX/+fD311FP6/vvv9fDDD2vevHn6xz/+YT+HeQNJOnr0qHr06KGnn37a5evezJMJEybo22+/VX5+vlauXKk1a9ZoypQpDfUtSAaC4qKLLjKmTZtmf261Wo1OnToZc+fODeKoEMr27NljSDJWr15tGIZhlJSUGDExMcYbb7xhP+f77783JBkbNmwI1jARIg4fPmyccsopRn5+vtG/f39jxowZhmEwb+Den/70J6Nv375uX6+srDQ6duxoPPLII/ZjJSUlRmxsrLF48eKGGCJC0JVXXmnccMMNDscyMzONCRMmGIbBvIFrkozly5fbn3szT7777jtDkvH555/bz3nvvf
cMi8ViFBUVNci4WXEKghMnTuiLL77QoEGD7MeaNGmiQYMGacOGDUEcGULZoUOHJEmJiYmSpC+++EIVFRUO86h79+466aSTmEfQtGnTdOWVVzrMD4l5A/fefvtt9ezZU2PHjlX79u113nnn6fnnn7e/vm3bNu3evdth7rRq1UoXX3wxcyeC9e7dWx9//LF+/PFHSdJ//vMfrVu3TsOGDZPEvIF3vJknGzZsUOvWrdWzZ0/7OYMGDVKTJk3073//u0HGGd0g7wIH+/btk9VqVYcOHRyOd+jQQT/88EOQRoVQVllZqZycHPXp00dnnXWWJGn37t1q2rSpWrdu7XBuhw4dtHv37iCMEqFiyZIl+vLLL/X55587vca8gTtbt27V/PnzNXPmTP35z3/W559/rltvvVVNmzbVpEmT7PPD1d9dzJ3Ideedd6q0tFTdu3dXVFSUrFarHnjgAU2YMEGSmDfwijfzZPfu3Wrfvr3D69HR0UpMTGywuUTgBISBadOm6ZtvvtG6deuCPRSEuB07dmjGjBnKz89XXFxcsIeDMFJZWamePXvqwQcflCSdd955+uabb/Tss89q0qRJQR4dQtXSpUv12muvKTc3V2eeeaY2b96snJwcderUiXmDRodUvSBo166doqKinLpY/frrr+rYsWOQRoVQNX36dK1cuVKrVq1Samqq/XjHjh114sQJlZSUOJzPPIpsX3zxhfbs2aPzzz9f0dHRio6O1urVq/Xkk08qOjpaHTp0YN7ApeTkZJ1xxhkOx04//XRt375dkuzzg7+7UN3tt9+uO++8U+PGjdPZZ5+t6667Trfddpvmzp0riXkD73gzTzp27OjURO23337TgQMHGmwuETgFQdOmTXXBBRfo448/th+rrKzUxx9/rF69egVxZAglhmFo+vTpWr58uT755BN16dLF4fULLrhAMTExDvNoy5Yt2r59O/Mogl122WX6+uuvtXnzZvtXz549NWHCBPuvmTdwpU+fPk5bHvz444/q3LmzJKlLly7q2LGjw9wpLS3Vv//9b+ZOBCsrK1OTJo7/nIyKilJlZaUk5g2848086dWrl0pKSvTFF1/Yz/nkk09UWVmpiy++uGEG2iAtKOBkyZIlRmxsrLFo0SLju+++M6ZMmWK0bt3a2L17d7CHhhBx8803G61atTIKCgqM4uJi+1dZWZn9nJtuusk46aSTjE8++cTYtGmT0atXL6NXr15BHDVCUfWueobBvIFrn332mREdHW088MADxk8//WS89tprRnx8vPHqq6/az3nooYeM1q1bGytWrDD++9//GiNGjDC6dOliHDt2LIgjRzBNmjTJSElJMVauXGls27bNWLZsmdGuXTvjjjvusJ/DvIFhmN1ev/rqK+Orr74yJBmPPfaY8dVXXxm//PKLYRjezZPLL7/cOO+884x///vfxrp164xTTjnFGD9+fIN9DwROQfSPf/zDOOmkk4ymTZsaF110kbFx48ZgDwkhRJLLr4ULF9rPOXbsmDF16lSjTZs2Rnx8vDFq1CijuLg4eINGSKoZODFv4M4777xjnHXWWUZsbKzRvXt3Y8GCBQ6vV1ZWGvfcc4/RoUMHIzY21rjsssuMLVu2BGm0CAWlpaXGjBkzjJNOOsmIi4szTj75ZOPuu+82ysvL7ecwb2AYhrFq1SqX/66ZNGmSYRjezZP9+/cb48ePN1q0aGEkJCQY119/vXH48OEG+x4shlFta2cAAAAAgBNqnAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAACohcVi0VtvvRXsYQAAgozACQAQsiZPniyLxeL0dfnllwd7aACACBMd7AEAAFCbyy+/XAsXLnQ4FhsbG6TRAAAiFStOAICQFhsbq44dOzp8tWnTRpKZRjd//nwNGzZMzZo108knn6y8vDyH67/++mtdeumlatasmdq2baspU6boyJEjDue8+OKLOvPMMxUbG6vk5GRNnz7d4fV9+/Zp1KhRio+P1ymnnKK3337b/trBgwc1YcIEJSUlqVmzZjrllFOcAj0AQPgjcAIAhLV77rlHo0eP1n/+8x9NmDBB48aN0/fffy9JOnr0qIYOHao2bdro888/1xtvvKGPPvrIITCaP3++pk2bpilTpujrr7/W22+/rW7dujm8x5w5c5SVlaX//ve/uuKKKzRhwgQdOHDA/v7fffed3nvvPX3//feaP3++2rVr13AfAACgQVgMwzCCPQgAAFyZPHmyXn31VcXFxTkc//Of/6w///nPslgsuummmzR//nz7a5dcconOP/98PfPMM3r++ef1pz/9STt27FDz5s0lSe+++66GDx+uXbt2qUOHDkpJSdH111+v+++/3+UYLBaL/vKXv+i+++6TZAZjLVq00HvvvafLL79cV199tdq1a6cXX3yxnj4FAEAooMYJABDSBg4c6BAYSVJiYqL917169XJ4rVevXtq8ebMk6fvvv1ePHj3sQZMk9enTR5WVldqyZYssFot27dqlyy67rNYxnHPOOfZfN2/eXAkJCdqzZ48k6eabb9bo0aP15ZdfasiQIRo5cqR69+7t1/cKAAhdBE4AgJDWvHlzp9S5QGnWrJlX58XExDg8t1gsqqyslCQNGzZMv/zyi959913l5+frsssu07Rp0/Too48GfLwAgOChxgkAENY2btzo9Pz000+XJJ1++un6z3/+o6NHj9pf//TTT9WkSROddtppatmypdLT0/Xxxx/XaQxJSUmaNGmSXn31VT3++ONasGBBne4HAAg9rDgBAEJaeXm5du/e7XAsOjra3oDhjTfeUM+ePdW3b1+99tpr+uyzz/TCCy9IkiZMmKBZs2Zp0qRJmj17tvbu3atbbrlF1113nTp06CBJmj17tm666Sa1b99ew4YN0+HDh/Xpp5/qlltu8Wp89957ry644AKdeeaZKi8v18qVK+2BGwCg8SBwAgCEtPfff1/JyckOx0477TT98MMPksyOd0uWLNHUqVOVnJysxYsX64wzzpAkxcfH64MPPtCMGTN04YUXKj4+XqNHj9Zjjz1mv9ekSZN0/Phx/f3vf9cf//hHtWvXTmPGjPF6fE2bNtVdd92lwsJCNWvWTBkZGVqyZEkAvnMAQCihqx4AIGxZLBYtX75cI0eODPZQAACNHDVOAAAAAOABgRMAAAAAeECNEwAgbJFtDgBoKKw4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAe/D9KcbfSZkpy3gAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "epochs = list(range(configs_dict[\"max_training_steps\"]))\n", + "loss_values = ft_res[0].finetuning_losses\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b')\n", + "\n", + "# Set plot labels and title\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss Value')\n", + "plt.title('Loss Value vs. Number of Epochs')\n", + "\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save finetuned model to HuggingFace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(['python', '../../utils/upload_peft_model.py'] + f\"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly\".split())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stop LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-22 06:46:20 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "llm.stop_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference all over again with the finetuned model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "[0 - 7ff1caf83280] 0.270628 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270673 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270699 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270744 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270753 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", 
+ "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n", + "Background server started.\n", + "[]\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n", + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7ff1caf83280] 1.100415 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + 
"Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file 
layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file 
layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + "Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading 
weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file 
layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + "Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 
0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7ff1680b6740] 16.224181 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7ff1680b6740] 16.321885 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7ff168092740] 16.407712 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff1680b6740] 16.492788 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7ff168092740] 16.563500 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7ff168092740] 16.624616 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff168092740] 16.675778 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 16.725625 {3}{RequestManager}: Output token is: 13272\n", + "[0 - 7ff168092740] 16.776205 {3}{RequestManager}: Output token is: 315\n", + "[0 - 7ff168092740] 16.827883 {3}{RequestManager}: Output token is: 41389\n", + "[0 - 7ff168092740] 16.878348 {3}{RequestManager}: Output token is: 2715\n", + "[0 - 7ff168092740] 16.929025 {3}{RequestManager}: Output token is: 288\n", + "[0 - 7ff168092740] 16.979287 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 17.029879 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 17.078696 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.127942 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 17.177796 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 17.227023 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 17.277136 
{3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 17.328143 {3}{RequestManager}: Output token is: 64614\n", + "[0 - 7ff1680b6740] 17.378508 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.430618 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.482129 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 17.533479 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.584503 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 17.634591 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 17.685727 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 17.736768 {3}{RequestManager}: Output token is: 14535\n", + "[0 - 7ff168092740] 17.785909 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.836515 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.886526 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.936502 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.986222 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.037888 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.088468 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.138261 {3}{RequestManager}: Output token is: 25212\n", + "[0 - 7ff168092740] 18.187102 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.237270 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.289979 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.340895 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.391145 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.441155 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.499716 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 18.552423 {3}{RequestManager}: Output token is: 97814\n", + "[0 - 7ff168092740] 18.603261 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.654986 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.706227 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.756543 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.807690 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 18.857508 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.907649 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.958208 {3}{RequestManager}: Output token is: 41759\n", + "[0 - 7ff168092740] 19.009971 {3}{RequestManager}: Output token is: 388\n", + "[0 - 7ff168092740] 19.060626 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 19.112370 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 19.161425 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 19.206435 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 19.254004 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 19.306102 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 19.356853 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 19.408861 {3}{RequestManager}: Output token is: 89435\n", + "[0 - 7ff1680b6740] 19.460391 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.511207 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.565692 
{3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 19.617057 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 19.669739 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 19.722325 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 19.773583 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 19.824646 {3}{RequestManager}: Output token is: 68550\n", + "[0 - 7ff1680b6740] 19.876650 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.926939 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.977325 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.028247 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 20.078419 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.128614 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.179748 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.230542 {3}{RequestManager}: Output token is: 18311\n", + "[0 - 7ff1680b6740] 20.281634 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.330089 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.375491 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.422220 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.475078 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.526058 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.577651 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.628505 {3}{RequestManager}: Output token is: 7013\n", + "[0 - 7ff168092740] 20.681354 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.734160 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.786299 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.837268 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.888265 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.939708 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.990707 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.041260 {3}{RequestManager}: Output token is: 18742\n", + "[0 - 7ff1680b6740] 21.091386 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.145432 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 21.197149 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.249242 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 21.301514 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.352632 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.404018 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.455101 {3}{RequestManager}: Output token is: 56994\n", + "[0 - 7ff1680b6740] 21.506371 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.559369 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 21.611370 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.663655 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 21.715270 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.766481 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.818563 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.872108 
{3}{RequestManager}: Output token is: 29505\n", + "[0 - 7ff168092740] 21.922670 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.973973 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 22.024297 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 22.076266 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 22.127594 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 22.179008 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 22.230414 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 22.281805 {3}{RequestManager}: Output token is: 993\n", + "[0 - 7ff1680b6740] 22.282235 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7ff1680b6740] 22.282243 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the long neck of giraffes? Why do some animals have long tails? Why do some animals have long legs? Why do some animals have long ears? Why do some animals have long noses? Why do some animals have long whiskers? Why do some animals have long tongues? Why do some animals have long claws? Why do some animals have long teeth? Why do some animals have long hair? Why do some animals have long fur? Why do some animals have long feathers? Why do some animals have long scales? Why do some animals have long sp\n", + "[0 - 7ff1680b6740] 22.282250 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(15892528.0) finish(22282245.0) latency(6389717.0) ttft(15123707.0)\n", + "2024-07-22 06:43:05 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": 
False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", + "\n", + "# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "\n", + "lora_inference_config2 = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.finetuning_peft_model_id+\"-dolly\",\n", + " base_model_name_or_path=configs.base_model\n", + ")\n", + "llm.add_peft(lora_inference_config2)\n", + "\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "llm.compile(\n", + " generation_config,\n", + " max_requests_per_batch=configs.max_requests_per_batch,\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")\n", + "\n", + "llm.start_server()\n", + "\n", + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config2),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_2 = llm.generate(inference_requests)\n", + "\n", + "llm.stop_server()\n", + "\n", + "with open(\"after_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_2[0].output_text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py new file mode 100644 index 0000000000..9e01b4645b --- /dev/null +++ b/inference/python/peft_demo/demo.py @@ -0,0 +1,240 @@ +import json, random, subprocess +from datasets import load_dataset +from types import SimpleNamespace +from huggingface_hub import HfFolder +import os +import flexflow.serve as ff +import matplotlib.pyplot as plt + + +def create_datasets(finetune_dataset_size=2, 
inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'): + """Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k. + Only the 'open_qa' and 'closed_qa' prompts without context are kept. + The datasets are saved into the files given as arguments. + + Keyword arguments: + dataset_size -- the number of prompts to consider + inference_file_path -- the file in which to save the inference data + finetuning_file_path -- the file in which to save the finetuning data + """ + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + inference_data = [] + finetuning_data = [] + for row in dataset: + if len(finetuning_data) == finetune_dataset_size: + break + if ("open_qa" in row['category'] or "closed_qa" in row['category']) and len(row['context']) == 0: + inference_data.append(row['instruction']) + finetuning_data.append(row['instruction'] + " " + row['response']) + with open(inference_file_path, 'w') as file: + json.dump(inference_data[:1], file) + with open(finetuning_file_path, 'w') as file: + json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': ')) + + +configs_dict = { + "num_gpus": 1, + "memory_per_gpu": 21000, + "zero_copy_memory_per_node": 40000, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": False, + "fusion": False, + "max_requests_per_batch": 1, + "max_sequence_length": 128, + "max_tokens_per_batch": 128, + "max_training_steps": 100, + "seed": 42, +} +model_configs = { + "base_model": "meta-llama/Meta-Llama-3-8B", + "inference_peft_model_id": "goliaro/llama-3-8b-lora", + "finetuning_peft_model_id": "goliaro/llama-3-8b-lora", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + # relative paths + "inference_dataset": "inference_dataset.json", + "finetuning_dataset": "/usr/FlexFlow/inference/prompt/peft_dataset.json", + "output_file": "peft_demo.txt", +} +generation_configs = { + "do_sample": False, + "temperature": 0.9, + "topp": 0.8, + "topk": 1, +} +finetuning_configs = { + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, +} +# Merge dictionaries +configs_dict.update(model_configs) +configs_dict.update(generation_configs) +configs_dict.update(finetuning_configs) + + +random.seed(configs_dict["seed"]) + +create_datasets(inference_file_path=configs_dict["inference_dataset"], + finetuning_file_path=configs_dict["finetuning_dataset"]) + +configs = SimpleNamespace(**configs_dict) + +# Clear output file +with open(configs.output_file, 'w') as file: + file.write('') + +# Download base and peft inference models +args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs +ff.init(configs_dict) + +# Create the FlexFlow LLM +ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF +) +llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, +) +# Add inference and/or finetuning lora +lora_inference_config = None +lora_finetuning_config = None +if len(configs.inference_dataset) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model + ) + llm.add_peft(lora_inference_config) +if len(configs.finetuning_dataset) > 0: + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + trainable=True, + init_lora_weights=False, + rank=16, + lora_alpha=16.0, + # target_modules = ["down_proj"], + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": configs.learning_rate, + "momentum": configs.momentum, + "weight_decay": configs.weight_decay, + "nesterov": configs.nesterov, + }, + ) + llm.add_peft(lora_finetuning_config) + +# Compile the LLM for inference and load the weights into memory +generation_config = ff.GenerationConfig( + do_sample=configs.do_sample, + temperature=configs.temperature, + topp=configs.topp, + topk=configs.topk +) +enable_peft_finetuning = len(configs.finetuning_dataset) > 0 +llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning), + max_seq_length=configs.max_sequence_length, + max_tokens_per_batch=configs.max_tokens_per_batch, +) + + +llm.start_server() + + +# prompts = [s for s in json.load(open(configs.inference_dataset))] +# inference_requests = [ +# ff.Request( +# ff.RequestType.REQ_INFERENCE, +# prompt=prompt, +# max_sequence_length=configs.max_sequence_length, +# peft_model_id=llm.get_ff_peft_id(lora_inference_config), +# ) +# for prompt in prompts +# ] +# inf_req_res_1 = llm.generate(inference_requests) + + +finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset), + max_training_steps=configs.max_training_steps, +) +ft_res = llm.generate([finetuning_request]) +for res in ft_res: + print(res.finetuning_losses) + +# exit(0) +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/upload_peft_model.py'] + f"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly".split()) + + + +lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + base_model_name_or_path=configs.base_model +) +llm.add_peft(lora_inference_config) + +args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +#hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +# subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +prompts = [s for s in json.load(open(configs.inference_dataset))] 
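+# Build one inference request per prompt, routing each request through the
+# finetuned LoRA adapter registered just above (lora_inference_config points at
+# the locally finetuned peft model); the resulting outputs (inf_req_res_2) are
+# compared against the pre-finetuning generations at the end of the script.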
+inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts +] +inf_req_res_2 = llm.generate(inference_requests) + + +llm.stop_server() + + +print("==Inference result before finetuning: ", inf_req_res_1[0].output_text) +print("==Inference result after finetuning: ", inf_req_res_2[0].output_text) + + +epochs = list(range(configs_dict["max_training_steps"])) +loss_values = ft_res[0].finetuning_losses + +plt.figure(figsize=(10, 6)) +plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b') \ No newline at end of file diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index a6dfa8042e..39529abda3 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -76,7 +79,7 @@ def get_configs(): "full_precision": False, } ], - # "prompt": "", + "prompt": "", "output_file": "", } # Merge dictionaries diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 60233ac8d1..9689080825 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -414,15 +414,18 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py new file mode 100644 index 0000000000..38dd577574 --- /dev/null +++ b/inference/utils/download_peft_model.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse, os + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", type=str, help="Name of the model to download" + ) + parser.add_argument( + "peft_model_ids", + type=str, + nargs="+", + help="Name of the PEFT model(s) to download", + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default=os.environ.get("FF_CACHE_PATH", ""), + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + 
group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + if args.full_precision_only: + data_types = (ff.DataType.DT_FLOAT,) + elif args.half_precision_only: + data_types = (ff.DataType.DT_HALF,) + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) + llm.add_peft(lora_config) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/inference/utils/upload_peft_model.py b/inference/utils/upload_peft_model.py new file mode 100644 index 0000000000..7098d72f98 --- /dev/null +++ b/inference/utils/upload_peft_model.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +import argparse, os +from huggingface_hub import HfApi, HfFolder +from transformers import AutoModelForCausalLM +from peft import LoraConfig, PeftModel +import torch +import numpy as np +import flexflow.serve as ff +from peft import LoraConfig, get_peft_model + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download a PEFT model with FlexFlow, process it, and upload it to the Hugging Face Hub." + ) + parser.add_argument( + "--peft-model-id", + type=str, + required=True, + help="(Local) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--upload-peft-model-id", + type=str, + required=True, + help="(Remote) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--cache-folder", + type=str, + default=os.environ.get( + "FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow") + ), + help="Path to the FlexFlow cache folder", + ) + parser.add_argument( + "--private", + action="store_true", + help="Whether to upload the processed PEFT model as a private model on Hugging Face Hub.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + raise RuntimeError( + "Hugging Face token not found. Please login using `huggingface-cli login`." 
+ ) + + lora_config_filepath = os.path.join( + args.cache_folder, + "finetuned_models", + args.peft_model_id, + "config", + "ff_config.json", + ) + peft_config = ff.LoraLinearConfig.from_jsonfile(lora_config_filepath) + print(peft_config) + hf_peft_config = peft_config.to_hf_config() + print(hf_peft_config) + if peft_config.precision != "fp32" and peft_config.precision != "fp16": + raise ValueError(f"Unsupported precision: {peft_config.precision}") + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if peft_config.precision == "fp32" else torch.float16, + device_map="auto", + ) + model = get_peft_model(model, hf_peft_config) + in_dim = model.config.intermediate_size + out_dim = model.config.hidden_size + + weight_folder = os.path.join( + args.cache_folder, "finetuned_models", args.peft_model_id, "weights", "shard_0" + ) + num_shards = 1 + while os.path.exists(weight_folder.replace("shard_0", f"shard_{num_shards}")): + num_shards += 1 + if not in_dim % num_shards == 0: + raise ValueError( + f"Number of shards ({num_shards}) must divide the input dimension ({in_dim})" + ) + lora_weight_files = os.listdir(weight_folder) + for lora_file in sorted(lora_weight_files): + lora_filename = ".weight".join(lora_file.split(".weight")[:-1]) + hf_parameter_name = f"base_model.model.model.{lora_filename}.default.weight" + if hf_parameter_name not in model.state_dict().keys(): + raise KeyError(f"Parameter {lora_file} not found in HF model.") + + ff_dtype = np.float32 if peft_config.precision == "fp32" else np.float16 + weight_path = os.path.join(weight_folder, lora_file) + # LoRA_A: [in_dim, rank] + # LoRA_B: [rank, out_dim] + if "lora_A" in lora_file: + weight_data = [] + for shard_id in range(num_shards): + weight_path_shard = weight_path.replace("shard_0", f"shard_{shard_id}") + weight_data_shard = np.fromfile(weight_path_shard, dtype=ff_dtype) + print("===in_dim:", in_dim) + print("===out_dim:", out_dim) + print("===rank:", peft_config.rank) + print("===num_shards:", num_shards) + weight_data_shard = weight_data_shard.reshape( + (in_dim // num_shards, peft_config.rank), order="F" + ) + weight_data.append(weight_data_shard) + weight_data = np.concatenate(weight_data, axis=0).T + elif "lora_B" in lora_file: + weight_data = np.fromfile(weight_path, dtype=ff_dtype) + weight_data = weight_data.reshape((peft_config.rank, out_dim), order="F").T + weight_tensor = torch.from_numpy(weight_data) + + param = model.state_dict()[hf_parameter_name] + + actual_numel = weight_tensor.numel() + expected_numel = param.numel() + if actual_numel != expected_numel: + raise ValueError( + f"Parameter {lora_file} has unexpected parameter count: {actual_numel} (actual) != {expected_numel} (expected)" + ) + + if weight_tensor.shape != param.shape: + raise ValueError( + f"Parameter {lora_file} has unexpected shape: {weight_tensor.shape} (actual) != {param.shape} (expected)" + ) + if weight_tensor.dtype != param.dtype: + raise ValueError( + f"Parameter {lora_file} has unexpected dtype: {weight_tensor.dtype} (actual) != {param.dtype} (expected)" + ) + + with torch.no_grad(): + param.copy_(weight_tensor) + + model.push_to_hub(f"{args.upload_peft_model_id}", use_auth_token=True, private=args.private) + + print("Upload process completed.") + + +if __name__ == "__main__": + main() diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2820cf485a..b8ed15eaea 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ 
-88,7 +88,10 @@ "offload": "-offload", "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", - "use_8bit_quantization": "--8bit-quantization" + "use_8bit_quantization": "--8bit-quantization", + "enable_peft": "-enable-peft", + "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", + "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 14cf4eebf7..7692ccb88f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,8 @@ CompMode, MetricsType, InferenceMode, + RequestType, + OptimizerType, ModelType, OpType, ParameterSyncType, @@ -36,6 +38,9 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library +from typing import Union, List +from peft import LoraConfig +import json def ffc(): @@ -1243,1009 +1248,935 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # ----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. 
- :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. + super(NormInitializer, self).__init__(self.norm_handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - :param x: the first input Tensor. - :type x: Tensor + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param y: the second input Tensor. 
- :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param x: the first input Tensor. - :type x: Tensor +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :param y: the second input Tensor. - :type y: Tensor + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - :param name: the name of the layer. Default is None. - :type name: string + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param x: the first input Tensor. - :type x: Tensor + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :param y: the second input Tensor. 
- :type y: Tensor + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - :param input: the input Tensor. - :type input: Tensor +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :param axes: the axes along which reduction is applied - :type axes: List[int] + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - :param name: the name of the layer. Default is None. - :type name: string + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param input: the input Tensor. - :type input: Tensor + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. 
""" - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - :param name: the name of the layer. Default is None. - :type name: string +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow( - self.handle, input.handle, exponent, c_name - ) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " 0: + raise ValueError( + "Target modules can only be specified when init_lora_weights=True" + ) + else: + if init_lora_weights: + raise ValueError( + "LORA weights initialization from scratch not supported in inference model" + ) + if len(target_modules) > 0: + raise ValueError( + "Target modules can only be specified when trainable=True" + ) + + # Check rank, lora_alpha, lora_dropout values + if rank is not None or lora_alpha is not None or lora_dropout is not None: + if not trainable or not init_lora_weights: + raise ValueError( + "rank, lora_alpha, and lora_dropout can only be set when trainable=True and init_lora_weights=True" + ) + rank = rank if rank is not None else 8 + lora_alpha = lora_alpha if lora_alpha is not None else 8.0 + lora_dropout = lora_dropout if lora_dropout is not None else 0.0 + + # If passed, check if the values of rank, lora_alpha, and lora_dropout are valid + if rank < 1 or type(rank) != int: + raise ValueError("Rank must be >= 1 and an integer") + if lora_alpha <= 0: + raise ValueError("Lora_alpha must be > 0") + if lora_dropout < 0 or lora_dropout > 1: + raise ValueError("Lora_dropout must be in the interval [0, 1]") + + self.ff_initialized = False + self._cache_folder = cache_folder + self._peft_model_id = peft_model_id + self._trainable = trainable + self._init_lora_weights = init_lora_weights + self._base_model_name_or_path = base_model_name_or_path + self._precision = precision + self._rank = rank + self._lora_alpha = lora_alpha + self._lora_dropout = lora_dropout + self._target_modules = target_modules + self.optimizer_type = optimizer_type + self.optimizer_kwargs = optimizer_kwargs + + def ff_compile(self): + c_cache_folder = get_c_name(os.path.expanduser(self.cache_folder)) + peft_model_id = get_c_name(self.peft_model_id) + base_model_name_or_path = get_c_name(self.base_model_name_or_path) + precision = get_c_name(self.precision) + c_target_modules = [ + get_c_name(target_module) for target_module in self.target_modules + ] + c_optimizer_type = enum_to_int(OptimizerType, self.optimizer_type) + # SGD optional optimizer args + sgd_learning_rate = self.optimizer_kwargs.get("learning_rate", 0.001) + sgd_momentum = self.optimizer_kwargs.get("momentum", 0.0) + sgd_nesterov = 
self.optimizer_kwargs.get("nesterov", False) + sgd_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + # Adam optional optimizer args + adam_alpha = self.optimizer_kwargs.get("alpha", 0.001) + adam_beta1 = self.optimizer_kwargs.get("beta1", 0.9) + adam_beta2 = self.optimizer_kwargs.get("beta2", 0.999) + adam_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + adam_epsilon = self.optimizer_kwargs.get("epsilon", 1e-8) + self.handle = ffc().flexflow_lora_linear_config_create( + c_cache_folder, + peft_model_id, + self.trainable, + self.init_lora_weights, + base_model_name_or_path, + precision, + self.rank, + self.lora_alpha, + self.lora_dropout, + len(self.target_modules), + c_target_modules, + c_optimizer_type, + sgd_learning_rate, + sgd_momentum, + sgd_nesterov, + sgd_weight_decay, + adam_alpha, + adam_beta1, + adam_beta2, + adam_weight_decay, + adam_epsilon, + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_lora_linear_config_destroy) + self.ff_initialized = True + + @classmethod + def from_jsonfile(self, jsonfile: str): + with open(jsonfile, "r") as file: + config = json.load(file) + config_dict = dict(config) + config_dict["optimizer_type"] = OptimizerType.OPTIMIZER_TYPE_SGD + return LoraLinearConfig(**config_dict) + + def to_hf_config(self) -> LoraConfig: + return LoraConfig( + base_model_name_or_path=self.base_model_name_or_path, + r=self.rank, + target_modules=self.target_modules, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout, + ) - :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. - :type padding_h: int + @property + def cache_folder(self): + if self.ff_initialized: + c_cache_folder = ffc().flexflow_lora_linear_config_get_cache_folder( + self.handle + ) + return ffi.string(c_cache_folder).decode("utf-8") + else: + return self._cache_folder - :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. - :type padding_w: int + @property + def peft_model_id(self): + if self.ff_initialized: + c_peft_model_id = ffc().flexflow_lora_linear_config_get_peft_model_id( + self.handle + ) + return ffi.string(c_peft_model_id).decode("utf-8") + else: + return self._peft_model_id - :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. - :type activation: PoolType + @property + def rank(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_rank(self.handle) + else: + return self._rank - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode + @property + def lora_alpha(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_alpha(self.handle) + else: + return self._lora_alpha - :param name: the name of the layer. Default is None. - :type name: string + @property + def lora_dropout(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_dropout(self.handle) + else: + return self._lora_dropout - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - c_pool_type = enum_to_int(PoolType, pool_type) - c_activation = enum_to_int(ActiMode, activation) - handle = ffc().flexflow_model_add_pool2d( - self.handle, - input.handle, - kernel_h, - kernel_w, - stride_h, - stride_w, - padding_h, - padding_w, - c_pool_type, - c_activation, - c_name, - ) - self.add_layer(OpType.POOL2D, name) - return Tensor(handle, owner_op_type=OpType.POOL2D) + @property + def trainable(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_trainable(self.handle) + else: + return self._trainable - def batch_norm(self, input, relu=True, name=None): - """Layer that normalizes its inputs. + @property + def init_lora_weights(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_init_lora_weights(self.handle) + else: + return self._init_lora_weights - Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. + @property + def base_model_name_or_path(self): + if self.ff_initialized: + c_base_model_name_or_path = ( + ffc().flexflow_lora_linear_config_get_base_model_name_or_path( + self.handle + ) + ) + return ffi.string(c_base_model_name_or_path).decode("utf-8") + else: + return self._base_model_name_or_path - :param input: the list of input Tensors. - :type input: Tensor + @property + def precision(self): + if self.ff_initialized: + c_precision = ffc().flexflow_lora_linear_config_get_precision(self.handle) + return ffi.string(c_precision).decode("utf-8") + else: + return self._precision - :param relu: whether a ReLU function is applied. Default is True. - :type relu: bool + @property + def target_modules(self): + if self.ff_initialized: + num_target_modules = ffi.new("int *") + c_target_modules = ffc().flexflow_lora_linear_config_get_target_modules( + self.handle, num_target_modules + ) + target_modules = [] + for i in range(num_target_modules[0]): + target_modules.append(ffi.string(c_target_modules[i]).decode("utf-8")) + return target_modules + else: + return self._target_modules - :param name: the name of the layer. Default is None. - :type name: string + @cache_folder.setter + def cache_folder(self, value: str): + self._cache_folder = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_cache_folder(self.handle, value) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_batch_norm( - self.handle, input.handle, relu, c_name - ) - self.add_layer(OpType.BATCH_NORM, name) - return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + @peft_model_id.setter + def peft_model_id(self, value: str): + self._peft_model_id = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_peft_model_id(self.handle, value) - def layer_norm( - self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None - ): - """Add a LayerNorm layer + @rank.setter + def rank(self, value: int): + self._rank = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_rank(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: The LayerNorm output tensor - :rtype: Tensor - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_layer_norm( - self.handle, - input.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.LAYER_NORM, name) - return Tensor(handle, owner_op_type=OpType.LAYER_NORM) + @lora_alpha.setter + def lora_alpha(self, value: float): + self._lora_alpha = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_alpha(self.handle, value) - def residual_layer_norm( - self, - input, - residual1, - residual2, - use_two_residuals, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in - better efficiency compared to using separate element-wise add and LayerNorm operators. 
+ @lora_dropout.setter + def lora_dropout(self, value: float): + self._lora_dropout = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_dropout(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual1: The residual tensor to add to the input before computing the LayerNorm - :type residual1: Tensor - :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm - :type residual2: Tensor - :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise - :type use_two_residuals: bool - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: List[int] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: str, optional - :return: A tensor with the sum of the input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - residual2_handle = ( - residual1.handle - ) # This is intentional. Data will be ignored, and we cannot pass None - if use_two_residuals: - assert residual2 is not None - residual2_handle = residual2.handle - handles_array = ffc().flexflow_model_add_residual_layer_norm( - self.handle, - input.handle, - residual1.handle, - residual2_handle, - use_two_residuals, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM) + @trainable.setter + def trainable(self, value: bool): + self._trainable = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_trainable(self.handle, value) - def add_bias_residual_layer_norm( - self, - input, - residual, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, - resulting in better efficiency compared to using separate attention bias addition + - element-wise residual addition + LayerNorm operators. 
+ @init_lora_weights.setter + def init_lora_weights(self, value: bool): + self._init_lora_weights = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_init_lora_weights(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual: The residual tensor - :type residual: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( - self.handle, - input.handle, - residual.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - def sigmoid_silu_multi(self, input1, input2, name=None): - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid_silu_multi( - self.handle, input1.handle, input2.handle, c_name - ) - self.add_layer(OpType.SIGMOID_SILU_MULTI, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) +# ----------------------------------------------------------------------- +# PEFTModelID +# ----------------------------------------------------------------------- - def batch_matmul( - self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None - ): - """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param A: the first input Tensor. - :type A: Tensor +class PEFTModelID(object): + __slots__ = ["handle", "_handle"] - :param B: the second input Tensor. - :type B: Tensor + __no_id_h = None - :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension - :type a_seq_length_dim: int + def __init__(self, id=None): + if id is None: + self.handle = ffc().flexflow_peft_model_id_create() + else: + self.handle = ffc().flexflow_peft_model_id_create_id(id) + self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) - :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension - :type b_seq_length_dim: int + @staticmethod + def no_id_handle(): + if PEFTModelID.__no_id_h is None: + PEFTModelID.__no_id_h = ffc().flexflow_peft_model_id_no_id() + return PEFTModelID.__no_id_h - :param name: the name of the layer. Default is None. 
- :type name: string - :param name: Whether to add use bias in layer normalization - :type name: bool +# ----------------------------------------------------------------------- +# Request +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - if a_seq_length_dim is None: - a_seq_length_dim = -1 - if b_seq_length_dim is None: - b_seq_length_dim = -1 - handle = ffc().flexflow_model_add_batch_matmul( - self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim - ) - self.add_layer(OpType.BATCH_MATMUL, name) - return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def dense( +class Request: + """A class to record the metadata of an inference or finetuning request.""" + + def __init__( self, - input, - out_dim, - activation=ActiMode.AC_MODE_NONE, - use_bias=True, - datatype=DataType.DT_NONE, - shared_op=None, - kernel_initializer=None, - bias_initializer=None, - kernel_regularizer=None, - name=None, + req_type: RequestType, + prompt: str = None, + max_sequence_length: int = 128, + peft_model_id: PEFTModelID = None, + dataset_filepath: str = None, + max_training_steps: int = 1, ): - """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where - :attr:`activation` is the element-wise activation function passed as the activation argument, - :attr:`kernel` is a weights matrix created by the layer, and - :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + self.req_type = req_type + self.prompt = prompt + self.max_sequence_length = max_sequence_length + self.peft_model_id = peft_model_id + self.dataset_filepath = dataset_filepath + self.max_training_steps = max_training_steps - The size of input tensor is :math:`(N, C_{in})` and the size of output tensor - is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` - - :param input: the input Tensor. - :type input: Tensor - :param out\_dim: dimensionality of the output space. - :type out\_dim: int +# ----------------------------------------------------------------------- +# FFModel +# ----------------------------------------------------------------------- - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - :param use_bias: whether the layer uses a bias vector. Default is True. - :type use_bias: bool +class FFModel(object): + """ """ - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op + __slots__ = [ + "handle", + "_handle", + "_layers", + "_nb_layers", + "_ffconfig", + "_tracing_id", + "initializers", + "attr_tensors", + ] - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def __init__(self, ffconfig): + """Constructor of FFModel. - :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. - :type bias_initializer: Initializer + :param ffconfig: configurations of FlexFlow and the created model. + :type ffconfig: FFConfig - :param kernel_regularizer: Regularizer for the kernel weights matrix - :type bias_initializer: Regularizer + :returns: FFModel -- the model. 
+ """ + self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) + self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) + self._layers = dict() + self._nb_layers = 0 + self._ffconfig = ffconfig + global ff_tracing_id + self._tracing_id = ff_tracing_id + ff_tracing_id += 1 + self.initializers = {} + self.attr_tensors = {} - :param name: the name of the layer. Default is None. - :type name: string + def get_layers(self): + return self._layers - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - shared_op_handle = self.__get_op_handle(shared_op) - c_activation = enum_to_int(ActiMode, activation) - c_datatype = enum_to_int(DataType, datatype) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - bias_init_handle = self.__get_initializer_handle(bias_initializer) - if kernel_regularizer: - c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) - kernel_reg_lambda = kernel_regularizer._lambda - else: - c_kernel_reg_type = enum_to_int( - RegularizerMode, RegularizerMode.REG_MODE_NONE - ) - kernel_reg_lambda = 0.0 - handle = ffc().flexflow_model_add_dense( - self.handle, - input.handle, - out_dim, - c_activation, - use_bias, - c_datatype, - shared_op_handle, - kernel_init_handle, - bias_init_handle, - c_kernel_reg_type, - kernel_reg_lambda, - c_name, + def add_layer(self, op_type, name): + layer_id = self._nb_layers + op_handle = ffc().flexflow_model_get_last_layer(self.handle) + self._layers[self._nb_layers] = convert_op_handle_to_op( + op_type, op_handle, idx=layer_id, name=name ) - self.add_layer(OpType.LINEAR, name) - return Tensor(handle, owner_op_type=OpType.LINEAR) - - def concat(self, tensors, axis, name=None): - """Layer that concatenates a list of inputs. + self._nb_layers += 1 - It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. + def create_tensor(self, dims, data_type, create_grad=True): + """Instantiate a FlexFlow tensor. - :param input: the list of input Tensors. - :type input: List of Tensors + :param x: a shape tuple/list (integers), including the batch size. + :type x: list of int - :param axis: the dimension along which to concatenate. - :type axis: int + :param data_type: the datatype of the created tensor. Options are + DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. + :type data_type: DataType - :param name: the name of the layer. Default is None. - :type name: string + :param create_grad: weather the tensor creates a gradients vector. + If you don't specify anything, a gradients vector is used. + :type create_grad: bool :returns: Tensor -- the output tensor. 
""" - assert type(tensors) is list, "tensors should be a list" - tensor_handle_list = [] - n = len(tensors) - assert n <= 256, "Please increase MAX_NUM_INPUTS" - for tensor in tensors: - tensor_handle_list.append(tensor.handle) - c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_concat( - self.handle, n, c_tensor_handle_list, axis, c_name + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_tensor_create( + self.handle, num_dims, c_dims, c_data_type, create_grad ) - self.add_layer(OpType.CONCAT, name) - return Tensor(handle, owner_op_type=OpType.CONCAT) + return Tensor(handle) - def split(self, input, sizes, axis, name=None): - """Layer that splits a :attr:`input` tensor into a list of tensors. + def map_tensor(self, tensor, parallel_op=None): + op_handle = self.__get_op_handle(parallel_op) + ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) - :param input: the input Tensor. - :type input: Tensor + def create_constant(self, dims, value, data_type): + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_constant_create( + self.handle, num_dims, c_dims, value, c_data_type + ) + return Tensor(handle) - :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. - :type sizes: int or list of int + def exp(self, x, name=None): + """Exponential activation function. - :param axis: the dimension along which to split. - :type axis: int + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string - :returns: list of Tensors -- the output tensors. + :returns: Tensor -- the output tensor. """ - if type(sizes) is list: - split = sizes - else: - assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" - split = [input.dims[axis] // sizes for i in range(sizes)] - n = len(split) - assert n <= 256, "Please increase MAX_NUM_OUTPUTS" - c_split = ffi.new("int[]", split) - c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") c_name = get_c_name(name) - ffc().flexflow_model_add_split( - self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name - ) - output_tensor_list = [] - for i in range(n): - tensor_p_handle = ffi.new("flexflow_tensor_t*") - tensor_p_handle.impl = c_outputs_handle_list[i].impl - output_tensor_list.append( - Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) - ) - self.add_layer(OpType.SPLIT, name) - del c_outputs_handle_list - return output_tensor_list + handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) + self.add_layer(OpType.EXP, name) + return Tensor(handle, owner_op_type=OpType.EXP) - def flat(self, input, name=None): - """Flattens the input. Does not affect the batch size. + def sin(self, x, name=None): + """Elementwise sine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2253,15 +2184,15 @@ def flat(self, input, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) - self.add_layer(OpType.FLAT, name) - return Tensor(handle, owner_op_type=OpType.FLAT) + handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) + self.add_layer(OpType.SIN, name) + return Tensor(handle, owner_op_type=OpType.SIN) - def softmax(self, input, axis=-1, name=None): - """Softmax activation function. + def cos(self, x, name=None): + """Elementwise cosine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2269,23 +2200,18 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_softmax( - self.handle, input.handle, axis, c_name - ) - self.add_layer(OpType.SOFTMAX, name) - return Tensor(handle, owner_op_type=OpType.SOFTMAX) - - def reshape(self, input, shape, name=None): - """Layer that reshapes inputs into the given shape. + handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) + self.add_layer(OpType.COS, name) + return Tensor(handle, owner_op_type=OpType.COS) - Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, - except with a new shape given by :attr:`shape`. + def add(self, x, y, inplace_a=False, name=None): + """Layer that adds two input Tensors, :attr:`output = x + y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param shape: A list defining the shape of the output tensor. - :type shape: list of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2293,45 +2219,41 @@ def reshape(self, input, shape, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_shape = ffi.new("int[]", shape) - handle = ffc().flexflow_model_add_reshape( - self.handle, input.handle, len(shape), c_shape, c_name + handle = ffc().flexflow_model_add_add( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.RESHAPE, name) - return Tensor(handle, owner_op_type=OpType.RESHAPE) - - def gather(self, input, index, dim, name=None): - """Layer that gathers values along the dim axis. + self.add_layer(OpType.ADD, name) + return Tensor(handle, owner_op_type=OpType.ADD) - :param input: the input tensor - :type input: Tensor + def subtract(self, x, y, inplace_a=False, name=None): + """Layer that subtracts two input Tensors, :attr:`output = x * y`. - :param index: the index tensor, which specifies the indices of elements to gather - :type index: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param dim: the axis along which to index - :type dim: int + :param y: the second input Tensor. + :type y: Tensor - :param name: the name of the layer. Default is None + :param name: the name of the layer. Default is None. :type name: string - :returns: Tensor -- the output tensor + :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gather( - self.handle, input.handle, index.handle, dim, c_name + handle = ffc().flexflow_model_add_subtract( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.GATHER, name) - return Tensor(handle, owner_op_type=OpType.GATHER) + self.add_layer(OpType.SUBTRACT, name) + return Tensor(handle, owner_op_type=OpType.SUBTRACT) - def transpose(self, input, perm, name=None): - """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm + def multiply(self, x, y, inplace_a=False, name=None): + """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param perm: A permutation of the dimensions of a. - :type perm: List of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2339,23 +2261,20 @@ def transpose(self, input, perm, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_perm = ffi.new("int[]", perm) - handle = ffc().flexflow_model_add_transpose( - self.handle, input.handle, len(perm), c_perm, c_name + handle = ffc().flexflow_model_add_multiply( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.TRANSPOSE, name) - return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - - def reverse(self, input, axis, name=None): - """Layer that reverses specific dimensions of a tensor. + self.add_layer(OpType.MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.MULTIPLY) - Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + def divide(self, x, y, inplace_a=False, name=None): + """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param axis: the dimension to reverse. - :type axis: int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2363,20 +2282,20 @@ def reverse(self, input, axis, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_reverse( - self.handle, input.handle, axis, c_name + handle = ffc().flexflow_model_add_divide( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.REVERSE, name) - return Tensor(handle, owner_op_type=OpType.REVERSE) + self.add_layer(OpType.DIVIDE, name) + return Tensor(handle, owner_op_type=OpType.DIVIDE) - def scalar_multiply(self, input, scalar, inplace=True, name=None): - """Scalar multiplication of a tensor by an scalar. + def max(self, x, y, inplace_a=False, name=None): + """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2384,20 +2303,20 @@ def scalar_multiply(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_multiply( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_max( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) + self.add_layer(OpType.MAX, name) + return Tensor(handle, owner_op_type=OpType.MAX) - def scalar_add(self, input, scalar, inplace=True, name=None): - """Scalar addition of a scalar to each entry of a tensor. + def min(self, x, y, inplace_a=False, name=None): + """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2405,20 +2324,20 @@ def scalar_add(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_add( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_min( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_ADD, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) + self.add_layer(OpType.MIN, name) + return Tensor(handle, owner_op_type=OpType.MIN) - def scalar_sub(self, input, scalar, inplace=True, name=None): - """Scalar subtraction of a scalar to each entry of a tensor. + def reduce_sum(self, input, axes, keepdims=False, name=None): + """Layer that computes the sum of the input Tensor along given axes. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float + :param axes: the axes along which reduction is applied + :type axes: List[int] :param name: the name of the layer. Default is None. :type name: string @@ -2426,215 +2345,234 @@ def scalar_sub(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_sub( - self.handle, input.handle, scalar, inplace, c_name + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_reduce_sum( + self.handle, input.handle, c_axes, len(axes), keepdims, c_name ) - self.add_layer(OpType.SCALAR_SUB, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) + self.add_layer(OpType.REDUCE_SUM, name) + return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def scalar_true_divide(self, input, scalar, inplace=True, name=None): - """Scalar regular division of a tensor by an scalar. + def rsqrt(self, input, name=None): + """Layer that computes the element-wise reciprocal square-root. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_truediv( - self.handle, input.handle, scalar, inplace, c_name - ) - self.add_layer(OpType.SCALAR_TRUEDIV, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) + handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) + self.add_layer(OpType.RSQRT, name) + return Tensor(handle, owner_op_type=OpType.RSQRT) - def gelu(self, input, inplace=True, name=None): - """Gaussian Error Linear Unit activation function. + def pow(self, input, exponent, name=None): + """Layer that computes the element-wise power. :param input: the input Tensor. :type input: Tensor + :param exponent: exponent to raise each element in the input tensor. + :type exponent: float + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) - self.add_layer(OpType.GELU, name) - return Tensor(handle, owner_op_type=OpType.GELU) + handle = ffc().flexflow_model_add_pow( + self.handle, input.handle, exponent, c_name + ) + self.add_layer(OpType.POW, name) + return Tensor(handle, owner_op_type=OpType.POW) - def relu(self, input, inplace=True, name=None): - """Rectified Linear Unit activation function. + def mean(self, input, dims, keepdims=False, name=None): + """Layer that computes the mean of the input tensor across the given + dimensions. :param input: the input Tensor. :type input: Tensor + :param dims: dimensions to take the mean over. + :type dims: list + + :param keepdims: keeps the dimensions in :attr:`dims` as size 1 if True and + collapses the dimension if False. Default is False. + :type keepdims: bool + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + dims = list(dims) + c_dims = ffi.new("int[]", dims) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_relu( - self.handle, input.handle, inplace, c_name + handle = ffc().flexflow_model_add_mean( + self.handle, input.handle, c_dims, len(dims), keepdims, c_name ) - self.add_layer(OpType.RELU, name) - return Tensor(handle, owner_op_type=OpType.RELU) + self.add_layer(OpType.MEAN, name) + return Tensor(handle, owner_op_type=OpType.MEAN) - def identity(self, input, name=None): - """Identity function. + def conv2d( + self, + input, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + activation=ActiMode.AC_MODE_NONE, + groups=1, + use_bias=True, + shared_op=None, + kernel_initializer=None, + bias_initializer=None, + name=None, + ): + """This layer creates a 2D convolution kernel that is convolved with the layer :attr:`input` + to produce a tensor of :attr:`output`. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + C_{out} = out\_channels - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) - self.add_layer(OpType.IDENTITY, name) - return Tensor(handle, owner_op_type=OpType.IDENTITY) + .. math:: + K_{H} = kernel\_h - def sigmoid(self, input, name=None): - """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. + .. 
math:: + K_{W} = kernel\_w - :param input: the input Tensor. - :type input: Tensor + .. math:: + S_{H} = stride\_h - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + S_{W} = stride\_w - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) - self.add_layer(OpType.SIGMOID, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID) + .. math:: + P_{H} = padding\_h - def tanh(self, input, name=None): - """Hyperbolic tangent activation function. + .. math:: + P_{S} = padding\_s + + .. math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 + + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 :param input: the input Tensor. :type input: Tensor - :param name: the name of the layer. Default is None. - :type name: string + :param out\_channels: the dimensionality of the output space (i.e. the number of output filters in the convolution). + :type out\_channels: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) - self.add_layer(OpType.TANH, name) - return Tensor(handle, owner_op_type=OpType.TANH) + :param kernel_h: the height of the 2D convolution window: :math:`K_{H}`. + :type kernel_h: int - def elu(self, input, inplace=True, name=None): - """Exponential Linear Unit. activation function. + :param kernel_w: the width of the 2D convolution window: :math:`K_{W}`. + :type kernel_w: int - :param input: the input Tensor. - :type input: Tensor + :param stride_h: the stride of the convolution along the height: :math:`S_{H}`. + :type stride_h: int - :param name: the name of the layer. Default is None. - :type name: string + :param stride_w: the stride of the convolution along the width: :math:`S_{W}`. + :type stride_w: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_elu( - self.handle, input.handle, inplace, c_name - ) - self.add_layer(OpType.ELU, name) - return Tensor(handle, owner_op_type=OpType.ELU) + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int - def dropout(self, input, rate, seed, name=None): - """The Dropout layer randomly sets input units to 0 with - a frequency of :attr:`rate` at each step during training time, - which helps prevent overfitting. - Inputs not set to 0 are scaled up by 1/(1 - rate) such that the - sum over all inputs is unchanged. + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int - :param input: the input Tensor. - :type input: Tensor + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param rate: Fraction of the input units to drop. - :type rate: float(0-1) + :param groups: the number of groups in this convolution + :type groups: int - :param seed: random seed. - :type seed: int + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. 
+ :type bias_initializer: Initializer :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + bias_init_handle = self.__get_initializer_handle(bias_initializer) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_dropout( - self.handle, input.handle, rate, seed, c_name + handle = ffc().flexflow_model_add_conv2d( + self.handle, + input.handle, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_activation, + groups, + use_bias, + shared_op_handle, + kernel_init_handle, + bias_init_handle, + c_name, ) - self.add_layer(OpType.DROPOUT, name) - return Tensor(handle, owner_op_type=OpType.DROPOUT) + self.add_layer(OpType.CONV2D, name) + return Tensor(handle, owner_op_type=OpType.CONV2D) - def multihead_attention( + def embedding( self, - query, - key, - value, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, + input, + num_embeddings, + embedding_dim, + aggr, + dtype=DataType.DT_FLOAT, + shared_op=None, kernel_initializer=None, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. - - :param query: the query Tensor. - :type query: Tensor - - :param key: the key Tensor. - :type key: Tensor - - :param value: the value Tensor. - :type value: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int + """Layer that turns positive integers into dense vectors of fixed size - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :param input: the input Tensor. + :type input: Tensor - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param num_embeddings: size of the vocabulary, i.e. maximum integer index + 1 + :type num_embeddings: int - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param embedding_dim: dimension of the dense embedding. + :type embedding_dim: int - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. + :type aggr: AggrMode - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE + :type dtype: DataType - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param name: the name of the layer. Default is None. 
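# Worked instance of the shape formula above: with H = W = 32, a 3x3 kernel,
# stride 1, and padding 1, H_out = (32 - 3 + 2 * 1) / 1 + 1 = 32, so the spatial
# size is preserved. In the sketch below, `ffmodel` and the 4-D `input_tensor`
# are assumed to exist already; 64 output channels is an arbitrary choice.
conv1 = ffmodel.conv2d(input_tensor, 64, 3, 3, 1, 1, 1, 1)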
@@ -2643,97 +2581,105 @@ def multihead_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc().flexflow_model_add_multihead_attention( + shared_op_handle = self.__get_op_handle(shared_op) + c_aggr = enum_to_int(AggrMode, aggr) + c_dtype = enum_to_int(DataType, dtype) + if kernel_initializer is None: + kernel_initializer = GlorotUniformInitializer(42) + assert ( + (type(kernel_initializer) is GlorotUniformInitializer) + or (type(kernel_initializer) is ZeroInitializer) + or (type(kernel_initializer) is UniformInitializer) + or (type(kernel_initializer) is NormInitializer) + ), f"Unknown initializer type: {kernel_initializer}" + handle = ffc().flexflow_model_add_embedding( self.handle, - query.handle, - key.handle, - value.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - kernel_init_handle, + input.handle, + num_embeddings, + embedding_dim, + c_aggr, + c_dtype, + shared_op_handle, + kernel_initializer.handle, c_name, ) - self.add_layer(OpType.MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + # NOTE: We must keep a reference to the initializer or else it will be + # immediately destructed + self.initializers[name] = kernel_initializer + self.add_layer(OpType.EMBEDDING, name) + return Tensor(handle, owner_op_type=OpType.EMBEDDING) - def inc_multihead_self_attention( + def pool2d( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + pool_type=PoolType.POOL_MAX, + activation=ActiMode.AC_MODE_NONE, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Pooling operation for 2D spatial data. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param embed_dim: total dimension of the model - :type embed_dim: int + .. math:: + C_{out} = out\_channels - :param num_heads: Number of attention heads. - :type num_heads: int + .. math:: + K_{H} = kernel\_h - :param kdim: total number of features in key. Default is 0 - :type kdim: int + .. math:: + K_{W} = kernel\_w - :param vdim: total number of features in value. Default is 0 - :type vdim: int + .. math:: + S_{H} = stride\_h - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + .. math:: + S_{W} = stride\_w - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + .. math:: + P_{H} = padding\_h - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + .. math:: + P_{S} = padding\_s - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + .. 
math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param kernel_h: the height of the 2D pooling window: :math:`K_{H}`. + :type kernel_h: int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param kernel_w: the width of the 2D pooling window: :math:`K_{W}`. + :type kernel_w: int - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param stride_h: the stride of the pooling along the height: :math:`S_{H}`. + :type stride_h: int - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param stride_w: the stride of the pooling along the width: :math:`S_{W}`. + :type stride_w: int - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int + + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int + + :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. + :type activation: PoolType + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode :param name: the name of the layer. Default is None. :type name: string @@ -2741,102 +2687,34 @@ def inc_multihead_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention( + c_pool_type = enum_to_int(PoolType, pool_type) + c_activation = enum_to_int(ActiMode, activation) + handle = ffc().flexflow_model_add_pool2d( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_pool_type, + c_activation, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multihead_self_attention( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
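# A small sketch of the pooling call documented above, assuming `ffmodel` and a
# 4-D Tensor `conv1` already exist. With a 2x2 window, stride 2, and no padding,
# each spatial dimension is halved: (32 - 2 + 0) / 2 + 1 = 16.
pool1 = ffmodel.pool2d(conv1, 2, 2, 2, 2, 0, 0)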
- This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + self.add_layer(OpType.POOL2D, name) + return Tensor(handle, owner_op_type=OpType.POOL2D) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_norm(self, input, relu=True, name=None): + """Layer that normalizes its inputs. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param input: the list of input Tensors. + :type input: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param relu: whether a ReLU function is applied. Default is True. + :type relu: bool :param name: the name of the layer. Default is None. :type name: string @@ -2844,209 +2722,255 @@ def spec_inc_multihead_self_attention( :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + handle = ffc().flexflow_model_add_batch_norm( + self.handle, input.handle, relu, c_name + ) + self.add_layer(OpType.BATCH_NORM, name) + return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + + def layer_norm( + self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None + ): + """Add a LayerNorm layer + + :param input: The input tensor + :type input: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: The LayerNorm output tensor + :rtype: Tensor + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_layer_norm( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, c_name, ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.LAYER_NORM, name) + return Tensor(handle, owner_op_type=OpType.LAYER_NORM) - def inc_multihead_self_attention_verify( + def residual_layer_norm( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + residual1, + residual2, + use_two_residuals, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in + better efficiency compared to using separate element-wise add and LayerNorm operators. - :param input: the input Tensor. 
+ :param input: The input tensor :type input: Tensor + :param residual1: The residual tensor to add to the input before computing the LayerNorm + :type residual1: Tensor + :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm + :type residual2: Tensor + :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise + :type use_two_residuals: bool + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: List[int] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: str, optional + :return: A tensor with the sum of the input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + residual2_handle = ( + residual1.handle + ) # This is intentional. Data will be ignored, and we cannot pass None + if use_two_residuals: + assert residual2 is not None + residual2_handle = residual2.handle + handles_array = ffc().flexflow_model_add_residual_layer_norm( + self.handle, + input.handle, + residual1.handle, + residual2_handle, + use_two_residuals, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM), + ) - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def add_bias_residual_layer_norm( + self, + input, + residual, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, + name=None, + ): + """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, + resulting in better efficiency compared to using separate attention bias addition + + element-wise residual addition + LayerNorm operators. - :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: The input tensor + :type input: Tensor + :param residual: The residual tensor + :type residual: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( + self.handle, + input.handle, + residual.handle, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + ) - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + def sigmoid_silu_multi(self, input1, input2, name=None): + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid_silu_multi( + self.handle, input1.handle, input2.handle, c_name + ) + self.add_layer(OpType.SIGMOID_SILU_MULTI, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_matmul( + self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None + ): + """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param A: the first input Tensor. + :type A: Tensor - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param B: the second input Tensor. + :type B: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension + :type a_seq_length_dim: int + + :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension + :type b_seq_length_dim: int :param name: the name of the layer. Default is None. :type name: string + :param name: Whether to add use bias in layer normalization + :type name: bool + :returns: Tensor -- the output tensor. 
""" - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + if a_seq_length_dim is None: + a_seq_length_dim = -1 + if b_seq_length_dim is None: + b_seq_length_dim = -1 + handle = ffc().flexflow_model_add_batch_matmul( + self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.BATCH_MATMUL, name) + return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def inc_multiquery_self_attention( + def dense( self, input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, + out_dim, + activation=ActiMode.AC_MODE_NONE, + use_bias=True, + datatype=DataType.DT_NONE, + shared_op=None, kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + bias_initializer=None, + kernel_regularizer=None, name=None, ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where + :attr:`activation` is the element-wise activation function passed as the activation argument, + :attr:`kernel` is a weights matrix created by the layer, and + :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + + The size of input tensor is :math:`(N, C_{in})` and the size of output tensor + is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param out\_dim: dimensionality of the output space. + :type out\_dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool - :param data_type: the data type of the tensors. 
Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. + :type bias_initializer: Initializer - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param kernel_regularizer: Regularizer for the kernel weights matrix + :type bias_initializer: Regularizer :param name: the name of the layer. Default is None. :type name: string @@ -3054,107 +2978,128 @@ def inc_multiquery_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + c_datatype = enum_to_int(DataType, datatype) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, + bias_init_handle = self.__get_initializer_handle(bias_initializer) + if kernel_regularizer: + c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) + kernel_reg_lambda = kernel_regularizer._lambda + else: + c_kernel_reg_type = enum_to_int( + RegularizerMode, RegularizerMode.REG_MODE_NONE + ) + kernel_reg_lambda = 0.0 + handle = ffc().flexflow_model_add_dense( + self.handle, + input.handle, + out_dim, + c_activation, + use_bias, + c_datatype, + shared_op_handle, kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + bias_init_handle, + c_kernel_reg_type, + kernel_reg_lambda, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + self.add_layer(OpType.LINEAR, name) + return Tensor(handle, owner_op_type=OpType.LINEAR) - def spec_inc_multiquery_self_attention( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
- This operator only supports computing the attention in inference (beam search) mode. + def concat(self, tensors, axis, name=None): + """Layer that concatenates a list of inputs. - :param input: the input Tensor. - :type input: Tensor + It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. - :param embed_dim: total dimension of the model - :type embed_dim: int + :param input: the list of input Tensors. + :type input: List of Tensors - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param axis: the dimension along which to concatenate. + :type axis: int - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :returns: Tensor -- the output tensor. + """ + assert type(tensors) is list, "tensors should be a list" + tensor_handle_list = [] + n = len(tensors) + assert n <= 256, "Please increase MAX_NUM_INPUTS" + for tensor in tensors: + tensor_handle_list.append(tensor.handle) + c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_concat( + self.handle, n, c_tensor_handle_list, axis, c_name + ) + self.add_layer(OpType.CONCAT, name) + return Tensor(handle, owner_op_type=OpType.CONCAT) - :param vdim: total number of features in value. Default is 0 - :type vdim: int + def split(self, input, sizes, axis, name=None): + """Layer that splits a :attr:`input` tensor into a list of tensors. - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param input: the input Tensor. + :type input: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. + :type sizes: int or list of int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param axis: the dimension along which to split. + :type axis: int - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param name: the name of the layer. Default is None. + :type name: string - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :returns: list of Tensors -- the output tensors. 
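# A small sketch of the concat/split pair documented above, assuming `ffmodel`,
# `t1`, and `t2` already exist and agree on every dimension except axis 1, and
# that axis 1 of the concatenated tensor is evenly divisible by 2.
merged = ffmodel.concat([t1, t2], 1)
parts = ffmodel.split(merged, 2, 1)  # two equal-sized tensors along axis 1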
+ """ + if type(sizes) is list: + split = sizes + else: + assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" + split = [input.dims[axis] // sizes for i in range(sizes)] + n = len(split) + assert n <= 256, "Please increase MAX_NUM_OUTPUTS" + c_split = ffi.new("int[]", split) + c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") + c_name = get_c_name(name) + ffc().flexflow_model_add_split( + self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name + ) + output_tensor_list = [] + for i in range(n): + tensor_p_handle = ffi.new("flexflow_tensor_t*") + tensor_p_handle.impl = c_outputs_handle_list[i].impl + output_tensor_list.append( + Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) + ) + self.add_layer(OpType.SPLIT, name) + del c_outputs_handle_list + return output_tensor_list - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def flat(self, input, name=None): + """Flattens the input. Does not affect the batch size. - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param input: the input Tensor. + :type input: Tensor - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) + self.add_layer(OpType.FLAT, name) + return Tensor(handle, owner_op_type=OpType.FLAT) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def softmax(self, input, axis=-1, name=None): + """Softmax activation function. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param input: the input Tensor. + :type input: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -3162,107 +3107,93 @@ def spec_inc_multiquery_self_attention( :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_softmax( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.SOFTMAX, name) + return Tensor(handle, owner_op_type=OpType.SOFTMAX) - def inc_multiquery_self_attention_verify( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + def reshape(self, input, shape, name=None): + """Layer that reshapes inputs into the given shape. + + Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, + except with a new shape given by :attr:`shape`. :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int + :param shape: A list defining the shape of the output tensor. + :type shape: list of int - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_shape = ffi.new("int[]", shape) + handle = ffc().flexflow_model_add_reshape( + self.handle, input.handle, len(shape), c_shape, c_name + ) + self.add_layer(OpType.RESHAPE, name) + return Tensor(handle, owner_op_type=OpType.RESHAPE) - :param kdim: total number of features in key. Default is 0 - :type kdim: int + def gather(self, input, index, dim, name=None): + """Layer that gathers values along the dim axis. - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param input: the input tensor + :type input: Tensor - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param index: the index tensor, which specifies the indices of elements to gather + :type index: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param dim: the axis along which to index + :type dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param name: the name of the layer. Default is None + :type name: string - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. 
- :type add_zero_attn: bool + :returns: Tensor -- the output tensor + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_gather( + self.handle, input.handle, index.handle, dim, c_name + ) + self.add_layer(OpType.GATHER, name) + return Tensor(handle, owner_op_type=OpType.GATHER) - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def transpose(self, input, perm, name=None): + """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param perm: A permutation of the dimensions of a. + :type perm: List of int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_perm = ffi.new("int[]", perm) + handle = ffc().flexflow_model_add_transpose( + self.handle, input.handle, len(perm), c_perm, c_name + ) + self.add_layer(OpType.TRANSPOSE, name) + return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def reverse(self, input, axis, name=None): + """Layer that reverses specific dimensions of a tensor. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + + :param input: the input Tensor. + :type input: Tensor + + :param axis: the dimension to reverse. + :type axis: int :param name: the name of the layer. Default is None. :type name: string @@ -3270,43 +3201,20 @@ def inc_multiquery_self_attention_verify( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_reverse( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.REVERSE, name) + return Tensor(handle, owner_op_type=OpType.REVERSE) - def rms_norm(self, input, eps, dim, name=None): - """Defines the RMS Norm layer. + def scalar_multiply(self, input, scalar, inplace=True, name=None): + """Scalar multiplication of a tensor by an scalar. :param input: the input Tensor. 
:type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3314,26 +3222,20 @@ def rms_norm(self, input, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rms_norm( - self.handle, input.handle, eps, dim, c_name + handle = ffc().flexflow_model_add_scalar_multiply( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.RMS_NORM, name) - return Tensor(handle, owner_op_type=OpType.RMS_NORM) - - def residual_rms_norm(self, input1, input2, eps, dim, name=None): - """Defines the Residual RMS Norm layer. + self.add_layer(OpType.SCALAR_MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) - :param input: the input 1 Tensor. - :type input: Tensor + def scalar_add(self, input, scalar, inplace=True, name=None): + """Scalar addition of a scalar to each entry of a tensor. - :param input: the input 2 Tensor. + :param input: the input Tensor. :type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3341,28 +3243,20 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handles_array = ffc().flexflow_model_add_residual_rms_norm( - self.handle, input1.handle, input2.handle, eps, dim, c_name - ) - self.add_layer(OpType.RESIDUAL_RMS_NORM, name) - return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( - handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM + handle = ffc().flexflow_model_add_scalar_add( + self.handle, input.handle, scalar, inplace, c_name ) + self.add_layer(OpType.SCALAR_ADD, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) - def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): - """Defines the Arg TopK layer. + def scalar_sub(self, input, scalar, inplace=True, name=None): + """Scalar subtraction of a scalar to each entry of a tensor. :param input: the input Tensor. :type input: Tensor - :param k: the top k indices to select - :type k: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool - - :param speculative_decoding: Whether you need to perform beam search - :type speculative_decoding: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3370,23 +3264,20 @@ def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_arg_top_k( - self.handle, input.handle, k, sorted, c_name + handle = ffc().flexflow_model_add_scalar_sub( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.ARG_TOPK, name) - return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + self.add_layer(OpType.SCALAR_SUB, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) - def beam_top_k(self, input, max_beam_size, sorted, name=None): - """Defines the Beam TopK layer. 
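# Illustrative sketch (not part of this patch): chaining a few of the tensor and
# scalar element-wise layers documented above. `ffmodel`, `x`, and `idx` are
# hypothetical placeholders for an FFModel instance and its input tensors.
g = ffmodel.gather(x, idx, dim=1)
t = ffmodel.transpose(g, [1, 0])
r = ffmodel.reverse(t, axis=0)
y = ffmodel.scalar_multiply(r, 2.0)
y = ffmodel.scalar_add(y, 1.0, inplace=False)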
+ def scalar_true_divide(self, input, scalar, inplace=True, name=None): + """Scalar regular division of a tensor by an scalar. :param input: the input Tensor. :type input: Tensor - :param max_beam_size: the top max_beam_size indices to select - :type max_beam_size: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3394,889 +3285,1498 @@ def beam_top_k(self, input, max_beam_size, sorted, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_beam_top_k( - self.handle, input.handle, max_beam_size, sorted, c_name + handle = ffc().flexflow_model_add_scalar_truediv( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.BEAM_TOPK, name) - return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + self.add_layer(OpType.SCALAR_TRUEDIV, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) - def sampling(self, input, top_p, name=None): - """Defines the Sampling layer. + def gelu(self, input, inplace=True, name=None): + """Gaussian Error Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param top_p: The top_p parameter of the sampling - :type top_p: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sampling( - self.handle, input.handle, top_p, c_name - ) - self.add_layer(OpType.SAMPLING, name) - return Tensor(handle, owner_op_type=OpType.SAMPLING) + handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) + self.add_layer(OpType.GELU, name) + return Tensor(handle, owner_op_type=OpType.GELU) - def argmax(self, input, beam_search, name=None): - """Defines the Sampling layer. + def relu(self, input, inplace=True, name=None): + """Rectified Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param beam_search: Whether you need to perform beam search - :type beam_search: bool - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_argmax( - self.handle, input.handle, beam_search, c_name + handle = ffc().flexflow_model_add_relu( + self.handle, input.handle, inplace, c_name ) - self.add_layer(OpType.ARGMAX, name) - return Tensor(handle, owner_op_type=OpType.ARGMAX) + self.add_layer(OpType.RELU, name) + return Tensor(handle, owner_op_type=OpType.RELU) - def reset_metrics(self): - """Reset performance metrics. + def identity(self, input, name=None): + """Identity function. - :returns: None -- no returns. - """ - ffc().flexflow_model_reset_metrics(self.handle) + :param input: the input Tensor. + :type input: Tensor - def init_layers(self): - """Initialize layers. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - ffc().flexflow_model_init_layers(self.handle) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) + self.add_layer(OpType.IDENTITY, name) + return Tensor(handle, owner_op_type=OpType.IDENTITY) - def prefetch(self): - ffc().flexflow_model_prefetch(self.handle) + def sigmoid(self, input, name=None): + """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. - def forward(self, seq_length=None): - """Forward propagation of all layers. + :param input: the input Tensor. + :type input: Tensor - :returns: None -- no returns. + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_forward(self.handle, seq_length) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) + self.add_layer(OpType.SIGMOID, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID) - # TODO: seperate compute_metrics from backward - def backward(self, seq_length=None): - """Backward propagation of all layers. + def tanh(self, input, name=None): + """Hyperbolic tangent activation function. - :returns: None -- no returns. - """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_backward(self.handle, seq_length) + :param input: the input Tensor. + :type input: Tensor - def compute_metrics(self): - """Compute performance metrics. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. """ - ffc().flexflow_model_compute_metrics(self.handle) - - def update(self): - """Update weights and biases of all layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_update(self.handle) - - def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): - """Configure the model for trainting. FlexFlow uses lazy initialization, - so the actual creating of all operations (including creating and partitioning - of weight, bias and output tensors) happen during compile. - - :param optimizer: optimizer instance. - :type optimizer: Optimizer + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) + self.add_layer(OpType.TANH, name) + return Tensor(handle, owner_op_type=OpType.TANH) - :param loss_type: Enum of LossType. - Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. - :type loss_type: LossType + def elu(self, input, inplace=True, name=None): + """Exponential Linear Unit. activation function. - :param metrics: List of metrics to be evaluated by the model during training and testing. - Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, - METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR - :type metrics: MetricsType + :param input: the input Tensor. + :type input: Tensor - :param comp_mode: Enum of CompMode. - Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE - :type comp_mode: CompMode + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - self.optimizer = optimizer - - c_loss_type = enum_to_int(LossType, loss_type) - metrics_int = [] - for metric in metrics: - metrics_int.append(enum_to_int(MetricsType, metric)) - c_metrics = ffi.new("int[]", metrics_int) - if comp_mode == None: - comp_mode = CompMode.TRAINING - c_comp_mode = enum_to_int(CompMode, comp_mode) - ffc().flexflow_model_compile( - self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_elu( + self.handle, input.handle, inplace, c_name ) - for ff_tensor, np_tensor in self.attr_tensors.items(): - ff_tensor.set_tensor(self, np_tensor) - print("Compiled ffmodel!") - - def fit(self, x=None, y=None, batch_size=None, epochs=1): - """Trains the model for a fixed number of epochs (iterations on a dataset). - - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader - - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader - - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int - - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int - - :returns: None -- no returns. - """ - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - self._tracing_id += 1 # get a new tracing id - for epoch in range(0, epochs): - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - for iter in range(0, int(iterations)): - self._ffconfig.begin_trace(self._tracing_id) - for d in dataloaders: - d.next_batch(self) - self.forward() - self.zero_gradients() - self.backward() - self.update() - self._ffconfig.end_trace(self._tracing_id) + self.add_layer(OpType.ELU, name) + return Tensor(handle, owner_op_type=OpType.ELU) - def eval(self, x=None, y=None, batch_size=None): - """Returns the loss value & metrics values for the model in test mode. + def dropout(self, input, rate, seed, name=None): + """The Dropout layer randomly sets input units to 0 with + a frequency of :attr:`rate` at each step during training time, + which helps prevent overfitting. + Inputs not set to 0 are scaled up by 1/(1 - rate) such that the + sum over all inputs is unchanged. - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader + :param input: the input Tensor. + :type input: Tensor - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader + :param rate: Fraction of the input units to drop. + :type rate: float(0-1) - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int + :param seed: random seed. + :type seed: int - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - self._tracing_id += 1 # get a new tracing id - for iter in range(0, int(iterations)): - for d in dataloaders: - d.next_batch(self) - self._ffconfig.begin_trace(self._tracing_id) - self.forward() - self.compute_metrics() - self._ffconfig.end_trace(self._tracing_id) - - def zero_gradients(self): - """Empty the gradients of all layers. + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_dropout( + self.handle, input.handle, rate, seed, c_name + ) + self.add_layer(OpType.DROPOUT, name) + return Tensor(handle, owner_op_type=OpType.DROPOUT) - :returns: None -- no returns. - """ - ffc().flexflow_model_zero_gradients(self.handle) + def multihead_attention( + self, + query, + key, + value, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kernel_initializer=None, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. - def set_optimizer(self, optimizer): - if isinstance(optimizer, SGDOptimizer) == True: - ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) - elif isinstance(optimizer, AdamOptimizer) == True: - ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) - elif optimizer == None: - pass - else: - assert 0, "[Model]: unknown optimizer" + :param query: the query Tensor. + :type query: Tensor - optimizer = property(fset=set_optimizer) + :param key: the key Tensor. + :type key: Tensor - def print_layers(self, id=-1): - ffc().flexflow_model_print_layers(self.handle, id) + :param value: the value Tensor. + :type value: Tensor - def get_layer_by_id(self, layer_id): - return self._layers[layer_id] + :param embed_dim: total dimension of the model + :type embed_dim: int - def get_last_layer(self): - return self._layers[self._nb_layers - 1] + :param num_heads: Number of attention heads. + :type num_heads: int - def get_layer_by_name(self, layer_name): - for layer_id in self._layers: - layer = self._layers[layer_id] - if layer.name == layer_name: - return layer - assert 0, f"Cannot find the layer with name {layer_name}" - return None + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def get_tensor_by_id(self, id): - handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) - return Parameter(handle) + :param vdim: total number of features in value. Default is 0 + :type vdim: int - @property - def label_tensor(self): - handle = ffc().flexflow_model_get_label_tensor(self.handle) - return Tensor(handle, deallocate=False) + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_perf_metrics(self): - handle = ffc().flexflow_model_get_perf_metrics(self.handle) - return PerfMetrics(handle) + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def set_transformer_layer_id(self, id): - ffc().flexflow_model_set_transformer_layer_id(self.handle, id) + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
+ :type add_bias_kv: bool - def create_data_loader(self, batch_tensor, full_array): - """Create a SingleDataloader instance. + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - :param batch_tensor: a batch-sized tensor. Usually it is a input tensor of the model. - :type batch_tensor: Tensor + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer - :param full_array: the entire data. - :type full_array: Numpy Array + :param name: the name of the layer. Default is None. + :type name: string - :returns: SingleDataloader -- returns a dataloader instance. + :returns: Tensor -- the output tensor. """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc().flexflow_model_add_multihead_attention( + self.handle, + query.handle, + key.handle, + value.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kernel_init_handle, + c_name, + ) + self.add_layer(OpType.MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) - if self._ffconfig.enable_control_replication: - assert ( - self._ffconfig.python_data_loader_type != 1 - ), "To enable control replication, please set --python-data-loader-type 2" - return self.__create_data_loader_ptr(batch_tensor, full_array) - else: - if self._ffconfig.python_data_loader_type == 1: - return self.__create_data_loader_attach(batch_tensor, full_array) - else: - return self.__create_data_loader_ptr(batch_tensor, full_array) - - def __create_data_loader_attach(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - num_dim = len(full_array_shape) - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" + def inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
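# Illustrative sketch (not part of this patch): the multi-head attention entry
# point above takes separate query/key/value tensors, while the incremental
# decoding variants that follow take a single input tensor. `ffmodel`, `q`,
# `k`, and `v` are hypothetical placeholders.
attn = ffmodel.multihead_attention(q, k, v, embed_dim=512, num_heads=8, dropout=0.1)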
+ :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multihead_self_attention_verify( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. 
Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. 
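# Illustrative sketch (not part of this patch): calling the multi-query
# incremental-decoding attention documented above, which allows fewer KV heads
# than Q heads. `ffmodel`, `hidden`, and the head counts are hypothetical.
attn_out = ffmodel.inc_multiquery_self_attention(
    hidden,
    embed_dim=4096,
    num_q_heads=32,
    num_kv_heads=8,
    apply_rotary_embedding=True,
)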
+ :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. - if num_dim == 2: - full_tensor = self.create_tensor( - [num_samples, full_array_shape[1]], datatype - ) - self.map_tensor(full_tensor) - elif num_dim == 4: - full_tensor = self.create_tensor( - [ - num_samples, - full_array_shape[1], - full_array_shape[2], - full_array_shape[3], - ], - datatype, - ) - self.map_tensor(full_tensor) - else: - assert 0, "unsupported dims" + :param input: the input Tensor. + :type input: Tensor - full_tensor.attach_numpy_array(self._ffconfig, full_array) - dataloader = SingleDataLoader( - self, batch_tensor, full_tensor, num_samples, datatype - ) - full_tensor.detach_numpy_array(self._ffconfig) + :param embed_dim: total dimension of the model + :type embed_dim: int - return dataloader + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int - def __create_data_loader_ptr(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" - np_raw_ptr = full_array.__array_interface__["data"] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - print( - "numpy array: %s, %s, %s" - % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0])) - ) - dataloader = SingleDataLoader( - self, batch_tensor, raw_ptr, num_samples, datatype - ) + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int - return dataloader + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __get_initializer_handle(self, initializer): - if initializer == None: - null_initializer = Initializer(None) - return null_initializer.handle - else: - return initializer.handle + :param vdim: total number of features in value. 
Default is 0 + :type vdim: int - def __get_op_handle(self, shared_op): - if shared_op == None: - op_handle = ffi.new("flexflow_op_t *") - op_handle.impl = ffi.NULL - op = Op(op_handle[0]) - else: - op = shared_op - return op.handle + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_output_tensor(self, ffmodel, data_type): - shape = self.dims - if data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__["data"] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64( - self.handle, ffmodel.handle, raw_ptr, False - ) - fflogger.debug( - "get weights raw_ptr: %s, %s, %s, %s" - % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) - ) - assert ret_val == True - return np_array + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def generate(self, prompt_list, max_sequence_length): - assert isinstance(prompt_list, list) - c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] - c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] - ffc().flexflow_model_generate( - self.handle, - len(prompt_list), - c_input_texts, - max_num_chars, - c_output_texts, - max_sequence_length, - c_output_length_and_tokens, - ) - #output_length = c_output_length_and_tokens[0] - #output_tokens = [] - #for i in range(output_length): - # output_tokens.append(c_output_length_and_tokens[i + 1]) - from flexflow.serve import GenerationResult + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool - return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_position_offset(self, offset): - ffc().flexflow_model_set_position_offset(self.handle, offset) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# SGDOptimizer -# ----------------------------------------------------------------------- + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. 
+ :type apply_rotary_embedding: bool + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool -class SGDOptimizer(object): - __slots__ = ["handle", "_handle"] + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float - def __init__( - self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 - ): - self.handle = ffc().flexflow_sgd_optimizer_create( - ffmodel.handle, lr, momentum, nesterov, weight_decay + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - def set_learning_rate(self, learning_rate): - ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) + def inc_multiquery_self_attention_verify( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + :param embed_dim: total dimension of the model + :type embed_dim: int -# ----------------------------------------------------------------------- -# AdamOptimizer -# ----------------------------------------------------------------------- + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int -class AdamOptimizer(object): - __slots__ = ["handle", "_handle"] + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __init__( - self, - ffmodel, - alpha=0.001, - beta1=0.9, - beta2=0.999, - weight_decay=0.0, - epsilon=1e-8, - ): - self.handle = ffc().flexflow_adam_optimizer_create( - ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon - ) - self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. 
Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_learning_rate(self, learning_rate): - ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# Initializer -# ----------------------------------------------------------------------- -class Initializer(object): - __slots__ = ["handle", "p_handle"] + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool - def __init__(self, handle, p_handle=0): - self.p_handle = ffi.new("flexflow_initializer_t *") - if handle == None: - self.p_handle.impl = ffi.NULL - else: - self.p_handle.impl = handle.impl - self.handle = self.p_handle[0] - assert ffi.typeof(self.handle) == ffi.typeof( - "flexflow_initializer_t" - ), "Initializer handle is wrong" + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float -# ----------------------------------------------------------------------- -# GlorotUniform -# ----------------------------------------------------------------------- + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool -class GlorotUniformInitializer(Initializer): - __slots__ = ["glorot_handle", "_glorot_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self, seed): - self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) - self._glorot_handle = ffi.gc( - self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - super(GlorotUniformInitializer, self).__init__(self.glorot_handle) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + def rms_norm(self, input, eps, dim, name=None): + """Defines the RMS Norm layer. 
-# ----------------------------------------------------------------------- -# ZeroInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + :param dim: The dimension with respect to which to take the norm + :type dim: int -class ZeroInitializer(Initializer): - __slots__ = ["zero_handle", "_zero_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self): - self.zero_handle = ffc().flexflow_zero_initializer_create() - self._zero_handle = ffi.gc( - self.zero_handle, ffc().flexflow_zero_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_rms_norm( + self.handle, input.handle, eps, dim, c_name ) - super(ZeroInitializer, self).__init__(self.zero_handle) + self.add_layer(OpType.RMS_NORM, name) + return Tensor(handle, owner_op_type=OpType.RMS_NORM) + def residual_rms_norm( + self, input1, input2, eps, dim, inplace_residual=False, name=None + ): + """Defines the Residual RMS Norm layer. -# ----------------------------------------------------------------------- -# UniformInitializer -# ----------------------------------------------------------------------- + :param input: the input 1 Tensor. + :type input: Tensor + :param input: the input 2 Tensor. + :type input: Tensor -class UniformInitializer(Initializer): - __slots__ = ["uniform_handle", "_uniform_handle"] + :param eps: a value added to the denominator for numerical stability + :type eps: float - def __init__(self, seed, minv, maxv): - self.uniform_handle = ffc().flexflow_uniform_initializer_create( - seed, minv, maxv + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :param inplace_residual: whether to compute the residual inplace using the input tensor. Default is False. + :type inplace_residual: bool + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handles_array = ffc().flexflow_model_add_residual_rms_norm( + self.handle, + input1.handle, + input2.handle, + eps, + dim, + inplace_residual, + c_name, ) - self._uniform_handle = ffi.gc( - self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + self.add_layer(OpType.RESIDUAL_RMS_NORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM), ) - super(UniformInitializer, self).__init__(self.uniform_handle) + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): + """Defines the Arg TopK layer. -# ----------------------------------------------------------------------- -# NormInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. 
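# Illustrative sketch (not part of this patch): the RMS-norm entry points
# documented above. `ffmodel`, `hidden`, and `attn_out` are hypothetical
# placeholders; residual_rms_norm returns a pair of tensors, matching the
# handles_array unpacking shown above.
normed = ffmodel.rms_norm(hidden, eps=1e-6, dim=4096)
res, res_normed = ffmodel.residual_rms_norm(
    hidden, attn_out, eps=1e-6, dim=4096, inplace_residual=False
)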
+ :type input: Tensor + :param k: the top k indices to select + :type k: int -class NormInitializer(Initializer): - __slots__ = ["norm_handle", "_norm_handle"] + :param sorted: Whether the entries should be sorted + :type sorted: bool - def __init__(self, seed, mean, stddev): - self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) - self._norm_handle = ffi.gc( - self.norm_handle, ffc().flexflow_norm_initializer_destroy - ) - super(NormInitializer, self).__init__(self.norm_handle) + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# PerfMetrics -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_arg_top_k( + self.handle, input.handle, k, sorted, c_name + ) + self.add_layer(OpType.ARG_TOPK, name) + return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + def beam_top_k(self, input, max_beam_size, sorted, name=None): + """Defines the Beam TopK layer. -class PerfMetrics(object): - __slots__ = ["handle", "_handle"] + :param input: the input Tensor. + :type input: Tensor - def __init__(self, handle): - self.handle = handle - self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) + :param max_beam_size: the top max_beam_size indices to select + :type max_beam_size: int - def get_accuracy(self): - return ffc().flexflow_per_metrics_get_accuracy(self.handle) + :param sorted: Whether the entries should be sorted + :type sorted: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# NetConfig -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_beam_top_k( + self.handle, input.handle, max_beam_size, sorted, c_name + ) + self.add_layer(OpType.BEAM_TOPK, name) + return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + def sampling(self, input, top_p, name=None): + """Defines the Sampling layer. -class NetConfig(object): - def __init__(self): - self.handle = ffc().flexflow_net_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) - cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cpath) + :param input: the input Tensor. + :type input: Tensor + :param top_p: The top_p parameter of the sampling + :type top_p: float -# ----------------------------------------------------------------------- -# DLRMConfig -# ----------------------------------------------------------------------- + :param name: the name of the layer. Default is None. + :type name: string + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sampling( + self.handle, input.handle, top_p, c_name + ) + self.add_layer(OpType.SAMPLING, name) + return Tensor(handle, owner_op_type=OpType.SAMPLING) -class DLRMConfig(object): - def __init__(self): - self.handle = ffc().flexflow_dlrm_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) + def argmax(self, input, beam_search, name=None): + """Defines the Sampling layer. 
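# Illustrative sketch (not part of this patch): the decoding heads documented
# above applied to a logits tensor. `ffmodel` and `logits` are hypothetical.
ids = ffmodel.argmax(logits, beam_search=False)
sampled_ids = ffmodel.sampling(logits, top_p=0.9)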
- cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cstr) + :param input: the input Tensor. + :type input: Tensor - cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) - self.arch_interaction_op = ffi.string(cstr) + :param beam_search: Whether you need to perform beam search + :type beam_search: bool - self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( - self.handle - ) - self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) - self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) - self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( - self.handle - ) - self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) + :param name: the name of the layer. Default is None. + :type name: string - mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) - self.mlp_bot = [] - for i in range(0, mlp_bot_c[0]): - self.mlp_bot.append(mlp_bot_c[i + 1]) + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_argmax( + self.handle, input.handle, beam_search, c_name + ) + self.add_layer(OpType.ARGMAX, name) + return Tensor(handle, owner_op_type=OpType.ARGMAX) - mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) - self.mlp_top = [] - for i in range(0, mlp_top_c[0]): - self.mlp_top.append(mlp_top_c[i + 1]) + def add_lora_layer(self, peft_config): + return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) - embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) - self.embedding_size = [] - for i in range(0, embedding_size_c[0]): - self.embedding_size.append(embedding_size_c[i + 1]) + def reset_metrics(self): + """Reset performance metrics. + :returns: None -- no returns. + """ + ffc().flexflow_model_reset_metrics(self.handle) -# ----------------------------------------------------------------------- -# Single DataLoader -# ----------------------------------------------------------------------- + def init_layers(self): + """Initialize layers. + :returns: None -- no returns. + """ + ffc().flexflow_model_init_layers(self.handle) -class SingleDataLoader(object): - __slots__ = ["handle", "_handle"] + def prefetch(self): + ffc().flexflow_model_prefetch(self.handle) - def __init__(self, ffmodel, input, full_input, num_samples, data_type): - assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" - assert type(input) is Tensor, "SingleDataLoader input is wrong" - if type(full_input) is Tensor: - self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) - else: - self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) - self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + def forward(self, seq_length=None): + """Forward propagation of all layers. - def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): - assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create( - ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type - ) + :returns: None -- no returns. 
+ """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_forward(self.handle, seq_length) - def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): - # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create2( - ffmodel.handle, input.handle, full_input, num_samples, c_data_type - ) + # TODO: seperate compute_metrics from backward + def backward(self, seq_length=None): + """Backward propagation of all layers. - @property - def num_samples(self): - return ffc().flexflow_single_dataloader_get_num_samples(self.handle) + :returns: None -- no returns. + """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_backward(self.handle, seq_length) - @num_samples.setter - def num_samples(self, samples): - ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) + def compute_metrics(self): + """Compute performance metrics. - def next_batch(self, ffmodel): - """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. + :returns: None -- no returns. + """ + ffc().flexflow_model_compute_metrics(self.handle) + + def update(self): + """Update weights and biases of all layers. :returns: None -- no returns. """ - ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) + ffc().flexflow_model_update(self.handle) - def reset(self): - """Reset the current position of the dataloder to 0. + def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): + """Configure the model for trainting. FlexFlow uses lazy initialization, + so the actual creating of all operations (including creating and partitioning + of weight, bias and output tensors) happen during compile. + + :param optimizer: optimizer instance. + :type optimizer: Optimizer + + :param loss_type: Enum of LossType. + Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. + :type loss_type: LossType + + :param metrics: List of metrics to be evaluated by the model during training and testing. + Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, + METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, + METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR + :type metrics: MetricsType + + :param comp_mode: Enum of CompMode. + Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE + :type comp_mode: CompMode :returns: None -- no returns. """ - ffc().flexflow_single_dataloader_reset(self.handle) + self.optimizer = optimizer + c_loss_type = enum_to_int(LossType, loss_type) + metrics_int = [] + for metric in metrics: + metrics_int.append(enum_to_int(MetricsType, metric)) + c_metrics = ffi.new("int[]", metrics_int) + if comp_mode == None: + comp_mode = CompMode.TRAINING + c_comp_mode = enum_to_int(CompMode, comp_mode) + ffc().flexflow_model_compile( + self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + ) + for ff_tensor, np_tensor in self.attr_tensors.items(): + ff_tensor.set_tensor(self, np_tensor) + print("Compiled ffmodel!") -class RegionNdarray(object): - __slots__ = ["__array_interface__"] + def fit(self, x=None, y=None, batch_size=None, epochs=1): + """Trains the model for a fixed number of epochs (iterations on a dataset). 
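# Illustrative sketch (not part of this patch): the training-mode flow built
# from the compile()/fit() methods documented above. `sgd_opt`, `input_loader`,
# and `label_loader` are hypothetical placeholders; the enum members are the
# ones listed in the compile() docstring.
ffmodel.compile(
    optimizer=sgd_opt,
    loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
    metrics=[MetricsType.METRICS_ACCURACY],
)
ffmodel.fit(x=input_loader, y=label_loader, epochs=1)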
- def __init__(self, shape, data_type, base_ptr, strides, read_only): - # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html - if data_type == DataType.DT_HALF: - field_type = " 0: + finetuning_losses = [ + c_finetuning_losses[i] for i in range(num_finetuning_losses[0]) + ] + results = [] + for c_output_text in c_output_texts: + results.append( + GenerationResult( + text=( + ffi.string(c_output_text) if c_output_text != ffi.NULL else None + ), + tokens=[], + finetuning_losses=finetuning_losses, + ) + ) + return results + + def set_position_offset(self, offset): + ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5af077273d..fd29080a6a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,7 +15,16 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import LLM, SSM, GenerationConfig, GenerationResult +from .serve import ( + LLM, + SSM, + GenerationConfig, + GenerationResult, + LoraLinearConfig, + PEFTModelID, + Request, + RequestType, +) def __check_positive_int(configs_dict: dict, key: str): @@ -44,6 +53,9 @@ def init( offload_reserve_space_size: Optional[int] = None, use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, + enable_peft: Optional[bool] = None, + peft_activation_reserve_space_size: Optional[int] = None, + peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -69,9 +81,12 @@ def init( - tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 - pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 - offload: whether to enable offloading of the weights to CPU, defaults to False - - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False + - enable_peft: whether to enable the use of PEFT, defaults to False + - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -100,12 +115,18 @@ def init( :type pipeline_parallelism_degree: Optional[int], optional :param offload: whether to enable offloading of the weights to CPU, defaults to False :type offload: Optional[bool], optional - :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB :type offload_reserve_space_size: Optional[int], optional :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False :type 
use_4bit_quantization: Optional[bool], optional :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False :type use_8bit_quantization: Optional[bool], optional + :param enable_peft: whether to enable the use of PEFT, defaults to False + :type enable_peft: Optional[bool], optional + :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + :type peft_activation_reserve_space_size: Optional[int], optional + :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB + :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False @@ -135,6 +156,9 @@ def init( offload_reserve_space_size is not None, use_4bit_quantization is not None, use_8bit_quantization is not None, + enable_peft is not None, + peft_activation_reserve_space_size is not None, + peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -161,6 +185,9 @@ def init( "offload_reserve_space_size": offload_reserve_space_size, "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, + "enable_peft": enable_peft, + "peft_activation_reserve_space_size": peft_activation_reserve_space_size, + "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -182,6 +209,8 @@ def init( "tensor_parallelism_degree", "pipeline_parallelism_degree", "offload_reserve_space_size", + "peft_activation_reserve_space_size", + "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -200,11 +229,17 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: configs_dict["use_8bit_quantization"] = False + if configs_dict.get("enable_peft", None) is None: + configs_dict["enable_peft"] = False + if configs_dict.get("peft_activation_reserve_space_size", None) is None: + configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 + if configs_dict.get("peft_weight_reserve_space_size", None) is None: + configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index e7f3914037..17bb894250 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -32,5 +32,8 @@ def __init__( def build_model(self): assert False, "Not implemented yet" + def convert_hf_weight_name(name): + assert False, "Not implemented yet" + def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7a55da26ef..0e8fbcbd7d 100644 --- 
a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -124,7 +124,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -135,7 +135,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -153,7 +153,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -170,7 +170,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -187,7 +187,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) else: assert False @@ -197,7 +197,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -205,7 +205,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -239,10 +239,18 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -251,17 +259,12 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") - ) + name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + name_v = 
name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -278,5 +281,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6b33030f62..96f0258572 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -62,7 +62,7 @@ def __init__( # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath - self.maxint = 2**31 - 1 + self.maxint = 2 ** 31 - 1 max_verify_tokens_per_batch = ( max_tokens_per_batch + self.llama_config.max_spec_tree_token_num ) @@ -106,7 +106,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -117,7 +117,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -125,7 +125,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -145,7 +145,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -193,21 +193,21 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -230,7 +230,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + 
name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -246,28 +246,16 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) - output = ffmodel.argmax(dense, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return name.replace("model.", "") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 92867fd498..b350ae106d 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -97,7 +97,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -114,7 +114,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -126,7 +126,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -148,7 +148,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -169,7 +169,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -190,7 +190,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -204,7 +204,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -212,7 +212,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -220,7 +220,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -232,7 +232,7 @@ 
def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -249,18 +249,27 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", "attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -273,13 +282,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "out_proj" in name: - name = name.replace("attn_out_proj", "attention_wo") - params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index b715f5f35e..02668abf59 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -163,7 +163,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -203,7 +203,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): axes, 
self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -226,14 +226,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -245,7 +245,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -263,7 +263,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -279,30 +279,29 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") + .replace( + ".final_layer_norm", ".add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" - ) # important to use the leading "_" to avoid matching the last LayerNorm - ) + name = FlexFlowOPT.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 37edaa4c40..2d4471201f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -111,7 +111,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -121,7 +121,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -159,7 +159,7 @@ def build_model(self, 
max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -171,7 +171,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -181,7 +181,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -189,7 +189,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -200,7 +200,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -217,18 +217,19 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer.", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -241,10 +242,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -257,14 +258,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") 
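The converters above repeatedly split a fused attention projection into separate q/k/v files under the new dotted names. A self-contained sketch of that step, assuming equal-sized q/k/v blocks (standard multi-head attention); multi-query models use unequal split sizes, as the surrounding code shows, and the file name and sizes below are illustrative only.

import os
import torch

def split_fused_qkv(params, name, dst_folder, hidden_size):
    # Split a fused (3*hidden, hidden) weight along dim 0 and write each
    # projection using the dotted naming scheme used by the converters above.
    os.makedirs(dst_folder, exist_ok=True)
    q, k, v = torch.split(params, [hidden_size, hidden_size, hidden_size], 0)
    for proj, tensor in (("q_proj", q), ("k_proj", k), ("v_proj", v)):
        out_name = name.replace("attn.c_attn", f"attn.c_attn.{proj}")
        tensor.detach().cpu().numpy().tofile(os.path.join(dst_folder, out_name))

# e.g. split_fused_qkv(w, "layers.0.attn.c_attn.weight", "/tmp/converted-weights", 4096)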
params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ac622b3337..132c50995b 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -28,44 +28,38 @@ ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib +import torch, shutil, hashlib, json, gc from typing import Union, List -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" +class _SupportedModels: + def __init__(self,): + self.supported_models = { + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), + } - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens + def get_ff_model_type(self, hf_config): + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + raise ValueError( + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + ) + return ff_arch class LLM: @@ -92,68 +86,117 @@ def __init__( :param output_file: Path to the output file. 
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "GPTBigCodeForCausalLM": ( - ModelType.STARCODER, - FlexFlowSTARCODER, - STARCODERConfig, - ), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), - } + self.supported_models = _SupportedModels() self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path ( self.model_type, self.model_class, self.config_class, - ) = self.__get_ff_model_type() + ) = self.supported_models.get_ff_model_type(self.hf_config) self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None + self.pefts = {} def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def __get_ff_model_type(self): - architectures = getattr(self.hf_config, "architectures", []) - ff_arch = None - if next(iter(architectures), None) is not None: - ff_arch = self.supported_models.get(architectures[0]) - if ff_arch is None: - print( - f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + def add_peft(self, lora_config: LoraLinearConfig): + """Add a PEFT adapter to the LLM""" + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + # Inference (trainable=False): LoRA model should already exist in huggingface. 
Any changes of parameters from original model are ignored + # Training (trainable=True): Either an existing model (init_lora_weights=False) or a new one (init_lora_weights=True) + + if lora_config.trainable == False or not lora_config.init_lora_weights: + peft_config = PeftConfig.from_pretrained(lora_config.peft_model_id) + else: + peft_config = LoraConfig( + peft_type="LORA", + base_model_name_or_path=self.model_name, + r=lora_config.rank, + target_modules=lora_config.target_modules, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, ) - sys.exit(1) - return ff_arch + if peft_config.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {peft_config.peft_type} not yet supported in FlexFlow" + ) + if "base_model_name_or_path" not in peft_config.to_dict(): + raise ValueError( + f"PEFT model {lora_config.peft_model_id} does not have an associated base model" + ) + if peft_config.base_model_name_or_path != self.model_name: + raise RuntimeError( + f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" + ) + + self.pefts[lora_config] = { + "peft_config": peft_config, + "peft_type": peft_config.peft_type, + } + + def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + if lora_config not in self.pefts: + raise ValueError( + f"PEFT {lora_config} not registered with LLM {self.model_name}" + ) + if "ff_peft_model_id" not in self.pefts[lora_config]: + raise RuntimeError( + f"Attempting to run PEFT {lora_config} before compiling LLM {self.model_name}" + ) + + return self.pefts[lora_config]["ff_peft_model_id"] def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
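The two branches above mirror the two ways a LoRA configuration can originate. A minimal sketch using the peft library directly; the adapter id is a hypothetical placeholder, so from_pretrained would only succeed if such a repository actually exists.

from peft import LoraConfig, PeftConfig

adapter_id = "someuser/llama-2-7b-lora"  # hypothetical adapter repository

# Case 1: reuse an adapter that already exists (inference, or resuming training).
existing_config = PeftConfig.from_pretrained(adapter_id)

# Case 2: describe a brand-new LoRA adapter to be trained from scratch.
new_config = LoraConfig(
    base_model_name_or_path="meta-llama/Llama-2-7b-hf",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["down_proj"],
    init_lora_weights=True,
)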
Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for ff_peft_config, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_model_id = ff_peft_config.peft_model_id + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) - def __get_revision_hashes(self, model_name: str, weights: bool): + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -173,65 +216,109 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. + + If any PEFT adapter is registered, perform the same operation for PEFT. """ - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." 
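The caching scheme in this file keys each model's weights by name and precision and tracks freshness with a rev_sha.txt marker. A self-contained sketch of that layout and staleness check; the cache path and model name in the comment are illustrative.

import os

def weights_cache_dir(cache_path, model_name, full_precision):
    # One folder per model and precision, e.g.
    # ~/.cache/flexflow/weights/meta-llama/llama-2-7b-hf/half-precision
    precision = "full-precision" if full_precision else "half-precision"
    return os.path.join(
        os.path.expanduser(cache_path), "weights", model_name.lower(), precision
    )

def cache_is_stale(folder, latest_revision):
    # The cached copy is stale when the stored hash is missing or differs
    # from the latest revision reported upstream.
    rev_file = os.path.join(folder, "rev_sha.txt")
    if not os.path.exists(rev_file):
        return True
    with open(rev_file) as f:
        return "".join(f.read().split()) != latest_revision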
+ def get_weights_path(model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), + ) + + def download_llm_weights(): + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, self.weights_path + ) + if ff_revision != latest_revision: print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True - ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for ff_peft_config, peft_dict in self.pefts.items(): + if not ff_peft_config.init_lora_weights: + peft_config = peft_dict["peft_config"] + peft_type = peft_dict["peft_type"] + peft_model_id = ff_peft_config.peft_model_id + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + peft_model_id, weights_path + ) + + if ff_revision != latest_revision: + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + self.weights_path = get_weights_path(self.model_name) + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -241,13 +328,11 @@ def download_hf_tokenizer_if_needed(self): # Use local cache, or download new version self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), - "tokenizers", - self.model_name.lower(), + os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() ) if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -257,46 +342,29 @@ def download_hf_tokenizer_if_needed(self): # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." + ) # Download tokenizer from HuggingFace, or load it from the local folder - if self.model_type == ModelType.LLAMA: - hf_tokenizer = LlamaTokenizer.from_pretrained( - self.model_name, use_fast=True - ) - else: - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") + hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = None, model_specific_tensor_parallelism_degree: int = None, model_specific_pipeline_parallelism_degree: int = None, @@ -312,6 +380,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None @@ -321,9 +391,6 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - # self.max_requests_per_batch = max_requests_per_batch - # self.max_seq_length = max_seq_length - # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -355,6 +422,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate 
the relevant model self.model = self.model_class( @@ -366,16 +434,27 @@ def compile( max_tokens_per_batch, ) + # Download the config from huggingface + self.download_hf_config() + + # Download the tokenizer from huggingface (if needed) and load them + self.download_hf_tokenizer_if_needed() + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() + # Add PEFT layer if registered + for ff_peft_config, peft_dict in self.pefts.items(): + ff_peft_config.ff_compile() + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( model_configs.max_spec_tree_token_num - if "max_spec_tree_token_num" - in model_configs.__dict__ + if "max_spec_tree_token_num" in model_configs.__dict__ else 20 ) @@ -393,9 +472,6 @@ def compile( self.im = InferenceManager() self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - # Download the tokenizer from huggingface (if needed) and load them - self.download_hf_tokenizer_if_needed() - # Create tokenizer (this must be done after we have downloaded the tokenizer bos_token_id = ( -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id @@ -419,22 +495,36 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, or list of strings - :type prompts: Union[str, List[str]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + print(requests_or_prompts) + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" @@ -446,17 +536,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -482,13 +561,7 @@ def __init__( :param output_file: Path to the output file. 
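A hedged usage sketch for the generate() dispatch above, assuming an already-compiled LLM object named llm; the prompt texts are illustrative.

# A single prompt or a list of prompts runs inference-only generation.
single = llm.generate("What is the capital of France?", max_length=64)
batch = llm.generate(
    [
        "Summarize the FlexFlow project in one sentence.",
        "Write a haiku about GPUs.",
    ],
    max_length=128,
)
# Fine-tuning work is submitted as Request objects (see RequestType above);
# their exact fields are defined elsewhere in this patch series, so they are
# not spelled out here.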
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - super().__init__( - model_name, - data_type, - cache_path, - refresh_cache, - output_file, - ) + super().__init__(model_name, data_type, cache_path, refresh_cache, output_file) def compile( self, @@ -496,15 +569,13 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 128, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory - - :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 @@ -513,6 +584,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 @@ -527,6 +600,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + enable_peft_finetuning, model_specific_data_parallelism_degree, model_specific_tensor_parallelism_degree, model_specific_pipeline_parallelism_degree, diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..0f4726837c 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -46,6 +46,12 @@ class LossType(Enum): LOSS_IDENTITY = 54 +class OptimizerType(Enum): + OPTIMIZER_TYPE_NONE = 60 + OPTIMIZER_TYPE_SGD = 61 + OPTIMIZER_TYPE_ADAM = 62 + + class CompMode(Enum): TRAINING = 70 INFERENCE = 71 @@ -153,6 +159,11 @@ class OpType(Enum): RESIDUAL_LAYERNORM = 2306 +class RequestType(Enum): + REQ_INFERENCE = 4001 + REQ_FINETUNING = 4002 + + def enum_to_int(enum, enum_item): for item in enum: if enum_item == item: diff --git a/rdelacou/generate_trace.py b/rdelacou/generate_trace.py new file mode 100644 index 0000000000..986dab37df --- /dev/null +++ b/rdelacou/generate_trace.py @@ -0,0 +1,121 @@ +import pandas as pd +from math import ceil +from random import shuffle, uniform +import json, pickle, requests, os, argparse + +class TraceBuilder(object): + + # trace_type: either "conv" or "code" + def __init__(self, import_times=True, import_prompts=True): + self.req_times = None + self.imported_req_times = False + self.prompt_data = None + self.imported_prompt_data = False + if import_times: + self.import_trace_timestamps() + if import_prompts: + self.import_prompt_data() + + def 
import_trace_timestamps(self, trace_type="conv"): + if not self.imported_req_times: + # Import Microsoft LLM 1 hour trace + df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"]) + req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds + req_times = req_times - req_times.min() + self.req_times = req_times.tolist() + self.imported_req_times = True + + def import_prompt_data(self, shuffle_=True): + if not self.imported_prompt_data: + sharegpt_filename = "sharegpt_opt_text_completion_length.pkl" + sharegpt_filepath = f"./{sharegpt_filename}" + if os.path.exists(sharegpt_filepath): + os.remove("sharegpt_opt_text_completion_length.pkl") + sharegpt_url = f"https://github.com/sosp-ae-39/sosp-ae-astra/raw/main/datasets/{sharegpt_filename}" + response = requests.get(sharegpt_url) + with open(sharegpt_filename, "wb") as file: + file.write(response.content) + with open(sharegpt_filepath, 'rb') as f: + data2 = pickle.load(f) + os.remove("sharegpt_opt_text_completion_length.pkl") + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + + for pair in data2: + assert(len(pair) == 2) + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + num_pairs = len(prompt_lengths) + assert(num_pairs == len(generation_lengths)) + print("Number of conversation pairs: ", num_pairs) + + print(f"Prompt lengths: min={min(prompt_lengths)}, max={max(prompt_lengths)}, avg={sum(prompt_lengths)/len(prompt_lengths)}") + print(f"Generation lengths: min={min(generation_lengths)}, max={max(generation_lengths)}, avg={sum(generation_lengths)/len(generation_lengths)}") + total_lengths = [prompt_lengths[i] + generation_lengths[i] for i in range(len(prompt_lengths))] + print(f"Total lengths: min={min(total_lengths)}, max={max(total_lengths)}, avg={sum(total_lengths)/len(total_lengths)}") + + self.prompt_data = [{"human": prompt_lengths[i], "gpt": generation_lengths[i]} for i in range(num_pairs)] + + if shuffle_: + shuffle(self.prompt_data) + self.imported_prompt_data = True + + # Delta is in seconds + # Rate is in req per second + def generate_trace(self, target_arrival_rate=10, debug_verbose=False): + self.import_trace_timestamps() + self.import_prompt_data() + + microsec = 1000000 + avg_arrival_rate = len(self.req_times) / (self.req_times[-1]/float(microsec)) # Request per second. 
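generate_trace() scales each one-second bucket by the ratio of the target to the observed arrival rate and rounds the fractional result stochastically (see the bucket loop just below), so the expected request count matches the target rate. The rounding trick in isolation:

from random import uniform

def stochastic_round(x):
    # Round down, then add 1 with probability equal to the fractional part,
    # so the expected value of the result equals x.
    frac = x - int(x)
    return int(x) + int(uniform(0, 1) <= frac)

# Averaged over many draws, stochastic_round(2.3) tends to 2.3:
# sum(stochastic_round(2.3) for _ in range(100_000)) / 100_000  # ~2.3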
Computed this way to keep the numbers at a reasonable order of magnitude + if debug_verbose: + print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate) + scale_factor = float(target_arrival_rate) / avg_arrival_rate + if debug_verbose: + print("Scale factor to obtain target arrival rate: ", scale_factor) + + # Buckets are 1-second timeframes + nb_buckets = ceil(self.req_times[-1] / microsec) + buckets = [] + j = 0 + k = 0 + for i in range(nb_buckets): + bucket_size = 0 + while(j < len(self.req_times) and self.req_times[j] >= i*microsec and self.req_times[j] < (i+1)*microsec): + bucket_size += 1 + j += 1 + bucket_size = bucket_size*scale_factor + prob = bucket_size - int(bucket_size) + bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob) + + # If we have used all of the prompt data, loop back to the beginning and reuse some prompts + if k+bucket_size > len(self.prompt_data): + bucket = self.prompt_data[k:] + self.prompt_data[:(k+bucket_size)%len(self.prompt_data)] + else: + bucket = self.prompt_data[k:k+bucket_size] + k = (k+bucket_size) % len(self.prompt_data) + buckets.append(bucket) + + if debug_verbose: + print("Avg arrival rate obtained (req/s): ", sum([len(b) for b in buckets])/len(buckets)) + return buckets + +def generate_and_save_trace(arrival_rate, output_file): + builder = TraceBuilder() + trace = builder.generate_trace(target_arrival_rate=arrival_rate, debug_verbose=True) + with open(output_file, 'w+') as f: + json.dump(trace, f, indent=2) + +if __name__ == '__main__': + # Set up the argument parser + parser = argparse.ArgumentParser(description='Generate and save a trace.') + parser.add_argument('--arrival-rate', type=float, default=10.0, help='The target arrival rate for the trace.') + parser.add_argument('--output-file', type=str, default='sharegpt.json', help='The path to the output file to save the trace.') + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function with the user-provided arrival rate + generate_and_save_trace(args.arrival_rate, args.output_file) diff --git a/requirements.txt b/requirements.txt index ad65622367..64f1808934 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,11 @@ transformers>=4.31.0 sentencepiece einops pip +# peft-related +scipy +bitsandbytes +datasets +accelerate +loralib +triton +peft diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3d..e39cb29037 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,13 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_optimizer_config_t, LoraOptimizerConfig + // *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_sgd_optimizer_config_t, + // LoraSGDOptimizerConfig *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_adam_optimizer_config_t, + // LoraAdamOptimizerConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -649,6 +656,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -672,6 +680,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual,
input->data_type, name); assert(tensor_outputs[0] != nullptr); @@ -679,7 +688,7 @@ flexflow_tensor_t * DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " "%p, output0: %p, " "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " - "%f, use_bias: %d, name %s", + "%f, use_bias: %d, inplace_residual: %d, name %s", input, residual1, residual2, @@ -689,6 +698,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -706,6 +716,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -722,13 +733,14 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, input->data_type, name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, elementwise_affine %d, eps " - "%f, use_bias %d, name %s", + "%f, use_bias %d, inplace_residual: %d, name %s", input, residual, tensor_outputs[0], @@ -736,6 +748,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -1469,13 +1482,20 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input1 = FFCObjectWrapper::unwrap(input1_); Tensor input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor_outputs[2]; - handle->residual_rms_norm( - input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + handle->residual_rms_norm(input1, + input2, + tensor_outputs, + eps, + dim, + inplace_residual, + input1->data_type, + name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); flexflow_tensor_t *tensor_outputs_wrapped = @@ -1529,6 +1549,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1584,39 +1619,83 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { void flexflow_model_generate(flexflow_model_t handle_, int num_requests, + enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, - int max_seq_length, - int **output_length_and_tokens) { + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int 
*training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::vector prompts; + std::vector requests; + for (int i = 0; i < num_requests; i++) { - std::string const text_str(input_texts[i]); - prompts.push_back(text_str); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", - i, - handle, - text_str.c_str(), - max_seq_length); + if (request_types[i] == RequestType::REQ_INFERENCE) { + std::string const text_str(input_texts[i]); + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + inference_req.peft_model_id = *peft_model_id; + } + requests.push_back(inference_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_lengths[i]); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + fine_tuning_req.peft_model_id = *peft_model_id; + } + std::string const dataset_fp(dataset_filepaths[i]); + fine_tuning_req.dataset_filepath = dataset_fp; + fine_tuning_req.max_training_steps = training_steps[i]; + requests.push_back(fine_tuning_req); + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i", + i, + handle, + dataset_fp.c_str(), + max_seq_lengths[i], + training_steps[i]); + } else { + assert(false && "Unknown request type"); + } } - std::vector results = - handle->generate(prompts, max_seq_length); - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. + + std::vector results = handle->generate(requests); + for (int i = 0; i < num_requests; i++) { - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); - output_length_and_tokens[i][0] = results[i].output_tokens.size(); - std::copy(results[i].output_tokens.begin(), - results[i].output_tokens.end(), - output_length_and_tokens[i] + 1); - std::memcpy(output_texts[i], - results[i].output_text.c_str(), - results[i].output_text.length()); + if (request_types[i] == RequestType::REQ_INFERENCE) { + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + assert(results[i].finetuning_losses.size() > 0); + *num_finetuning_losses = results[i].finetuning_losses.size(); + // *finetuning_losses = results[i].finetuning_losses.data(); + std::memcpy(finetuning_losses, + results[i].finetuning_losses.data(), + results[i].finetuning_losses.size() * sizeof(float)); + } } - // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2597,6 +2676,14 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_enable_peft_finetuning(enable_peft_finetuning_); + DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d", + enable_peft_finetuning_); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -2730,3 +2817,238 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay) { +// LoraSGDOptimizerConfig *handle = +// new LoraSGDOptimizerConfig(lr, momentum, nesterov, weight_decay); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_) { +// LoraSGDOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon) { +// LoraAdamOptimizerConfig *handle = +// new LoraAdamOptimizerConfig(alpha, beta1, beta2, weight_decay, +// epsilon); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_) { +// LoraAdamOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// 
----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path_, + char const *precision_, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(base_model_name_or_path_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(precision_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraOptimizerConfig *optim_config = nullptr; + if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_SGD) { + optim_config = new LoraSGDOptimizerConfig( + sgd_learning_rate, sgd_momentum, sgd_nesterov, sgd_weight_decay); + } else if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_ADAM) { + optim_config = new LoraAdamOptimizerConfig( + adam_alpha, adam_beta1, adam_beta2, adam_weight_decay, adam_epsilon); + } + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + std::string const target_module(target_modules_[i]); + target_modules.push_back(target_module); + } + std::string const base_model_name_or_path(base_model_name_or_path_); + std::string const precision(precision_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, + peft_model_id, + trainable, + optim_config, + init_lora_weights, + base_model_name_or_path, + precision, + rank, + lora_alpha, + lora_dropout, + target_modules); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->cache_folder.c_str(); +} + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->peft_model_id.c_str(); +} + +int flexflow_lora_linear_config_get_rank( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->rank; +} + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_alpha; +} + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_dropout; +} + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return 
handle->trainable; +} + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->init_lora_weights; +} + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + *num_target_modules = handle->target_modules.size(); + static std::vector target_modules_; + target_modules_.clear(); + for (auto const &target_module : handle->target_modules) { + target_modules_.push_back(target_module.c_str()); + } + return target_modules_.data(); +} + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->base_model_name_or_path.c_str(); +} + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->precision.c_str(); +} + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_alpha = value; +} + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_dropout = value; +} + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->trainable = value; +} + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->init_lora_weights = value; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..99c13f5a67 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -86,7 +86,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( num_classes, k); // Scale logit gradients by op->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -116,7 +116,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( 
label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -146,7 +146,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -173,7 +173,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(loss_grad_volume), CUDA_NUM_THREADS, 0, diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..636ef9c4c3 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -81,7 +81,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, label_ptr, num_samples, num_classes, k); // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -100,7 +100,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( @@ -119,7 +119,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, @@ -135,7 +135,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - loss_grad_ptr, loss_grad_volume, 0, scale_factor); + loss_grad_ptr, loss_grad_volume, 0.0f, scale_factor); } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a17e156f18..7a1da2e974 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -43,7 +43,8 @@ bool operator==(AddBiasResidualLayerNormParams const &lhs, AddBiasResidualLayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && - lhs.use_bias == rhs.use_bias; + lhs.use_bias == rhs.use_bias && + lhs.inplace_residual == rhs.inplace_residual; } bool AddBiasResidualLayerNormParams::is_valid( @@ -58,7 +59,8 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -71,6 +73,7 @@ void 
FFModel::add_bias_residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -171,6 +174,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -189,6 +193,8 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( layer->get_int_vector_property("axes", axes); float eps; layer->get_float_property("eps", eps); + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new AddBiasResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -197,6 +203,7 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -215,6 +222,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -227,6 +235,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -239,7 +248,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( _input, _residual), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias) { + use_bias(_use_bias), inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -348,48 +357,57 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -397,7 +415,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -420,48 +438,56 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - // attn output - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + // input: attn output + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -469,7 +495,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -478,13 +504,11 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I/O): attn output AND added output (attn output + final attn bias + + residual) regions[1](I): residual regions[2](O): layer norm output + regions[3](I): final attn bias + regions[4](I): gamma + regions[5](I): beta */ OpMeta *AddBiasResidualLayerNorm::init_task( Task const *task, @@ -517,10 +541,6 @@ void AddBiasResidualLayerNorm::forward(FFModel const &ff) { assert(false); } -void AddBiasResidualLayerNorm::backward(FFModel const &ff) { - assert(false); -} - FutureMap AddBiasResidualLayerNorm::inference( FFModel const &ff, BatchConfigFuture const &bc, @@ -546,69 +566,94 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + // input + // added_output: input + attn bias + residual + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - // layer norm output + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } + // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); - // attn final bias - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { + // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); - + launcher.add_field(fid++, FID_DATA); if (use_bias) { + // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); } +void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I): input / added output + regions[1](I): attn bias + regions[2](I): residual + regions[3](O): output + regions[4](I): gamma + regions[5](I): beta */ void AddBiasResidualLayerNorm::inference_task( Task const *task, @@ -626,30 +671,72 @@ void AddBiasResidualLayerNorm::inference_task( *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == - 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + 4 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + + int rid = 0, tid = 0, did = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR attn_bias = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain attn_bias_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); Domain residual_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - Domain attn_bias_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain gamma_domain, beta_domain; assert(in_domain.get_volume() == out_domain.get_volume()); @@ -673,23 +760,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[5], - task->regions[5], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); if (m->use_bias) { beta = 
helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[6], - task->regions[6], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -707,16 +794,7 @@ void AddBiasResidualLayerNorm::inference_task( } AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - (int)attn_bias_dim, - (int)residual_domain.get_volume(), - input, - added_output, - output, - residual, - attn_bias, - gamma, - beta); + m, bc, input, attn_bias, residual, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -729,13 +807,299 @@ void AddBiasResidualLayerNorm::inference_task( weights_accessors.push_back(beta); } } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, shard_id, bc, {residual}, weights_accessors, {added_output, output}); + } +} + +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 5 + 
(m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + AddBiasResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual_grad, + attn_bias_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } AddBiasResidualLayerNorm::save_inference_tensors_to_file( m, shard_id, bc, - {input, residual}, + {input_grad, residual_grad}, weights_accessors, - {added_output, output}); + {output_grad}, + false /*fwd_pass*/); } } @@ -755,6 +1119,7 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -771,6 +1136,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; + bool inplace_residual; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -785,6 +1151,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -796,6 +1163,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -816,6 +1184,7 @@ size_t hash::operator()( } hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); + 
hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd9..681f55c998 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -23,12 +23,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -45,6 +46,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -75,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -94,53 +94,36 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { } template -__global__ void LayerNormFusedForwardKernel(int attn_bias_dim, - int residual_volume, - int64_t effective_num_elements, - int64_t effective_batch_size, +__global__ void LayerNormFusedForwardKernel(int64_t N, + int64_t attn_bias_dim, float eps, T const *input_ptr, T const *attn_bias_ptr, T const *residual_ptr, - T *added_output_ptr, - T *output_ptr, - T const *gamma_ptr, - T const *beta_ptr, + T *X, T *mean, - T *rstd) { - // Add attention bias and residual - CUDA_KERNEL_LOOP(i, residual_volume) { - int bias_idx = i % attn_bias_dim; - added_output_ptr[i] = - input_ptr[i] + attn_bias_ptr[bias_idx] + residual_ptr[i]; - } - - __syncthreads(); - - // LayerNorm + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - if (i >= effective_batch_size) { - return; - } float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - const int64_t index = i * effective_num_elements + j; - sum1 += static_cast(added_output_ptr[index]); - sum2 += static_cast(added_output_ptr[index]) * - static_cast(added_output_ptr[index]); - } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const int64_t bias_idx = index % attn_bias_dim; 
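+    // fused step: X = input + attention bias (broadcast via index % attn_bias_dim) + residual
+    // sum1/sum2 accumulate the per-row sum and sum of squares used below for mean and rstd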
+ X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(effective_num_elements); + float const scale = float(1) / static_cast(N); sum1 *= scale; sum2 = max(sum2 * scale - sum1 * sum1, float(0)); mean[i] = static_cast(sum1); @@ -150,17 +133,15 @@ __global__ void LayerNormFusedForwardKernel(int attn_bias_dim, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * effective_num_elements + j; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; const T_ACC gamma_v = - gamma_ptr == nullptr ? T_ACC(1) : static_cast(gamma_ptr[j]); + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = - beta_ptr == nullptr ? T_ACC(0) : static_cast(beta_ptr[j]); - output_ptr[index] = (static_cast(added_output_ptr[index]) - - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; } } @@ -178,57 +159,108 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - - std::pair kernel1_parallelism = std::make_pair( - GET_BLOCKS(residual_volume), std::min(residual_volume, CUDA_NUM_THREADS)); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel3_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = std::max({kernel1_parallelism.first, - kernel2_parallelism.first, - kernel3_parallelism.first}); - int num_threads = std::max({kernel1_parallelism.second, - kernel2_parallelism.second, - kernel3_parallelism.second}); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormFusedForwardKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, - attn_bias_dim, - residual_volume, m->effective_num_elements, - m->effective_batch_size, + attn_bias_dim, m->eps, input_ptr, attn_bias_ptr, residual_ptr, added_output_ptr, - output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that 
requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -239,8 +271,8 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_float_ptr(), added_output.get_float_ptr(), output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { AddBiasResidualLayerNorm::inference_kernel( @@ -252,12 +284,566 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_half_ptr(), added_output.get_half_ptr(), output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + // if (m->input_type[0] == DT_FLOAT) { + // print_tensor(input.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor(gamma.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_float_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } else { + // print_tensor( + // input.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor( + // gamma.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } + // print_tensor(in_ptr, 32, "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + 
GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..bcca1ba2c6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -22,12 +22,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -44,6 +45,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -74,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,20 +110,17 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const int64_t bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -135,7 +132,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -161,42 +158,33 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - attn_bias_dim, - m->eps, - input_ptr, - attn_bias_ptr, - residual_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { cudaStream_t stream; @@ -208,6 +196,69 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { 
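// [Editor's note] Illustrative CPU reference for what the fused forward kernel in
// this hunk computes; it is not part of the patch and all names here (rows, cols,
// eps, ...) are hypothetical. Each row is X = input + attn_bias + residual (the
// attention bias is broadcast along the token dimension, assuming attn_bias_dim
// equals the hidden size), normalized by its own mean and reciprocal standard
// deviation, then scaled and shifted by gamma/beta.
#include <algorithm>
#include <cmath>
#include <vector>

static void add_bias_residual_layer_norm_ref(std::vector<float> const &input,
                                             std::vector<float> const &attn_bias,
                                             std::vector<float> const &residual,
                                             std::vector<float> const &gamma,
                                             std::vector<float> const &beta,
                                             std::vector<float> &added_output,
                                             std::vector<float> &output,
                                             int rows, int cols, float eps) {
  for (int i = 0; i < rows; i++) {
    float sum1 = 0.0f, sum2 = 0.0f;
    for (int j = 0; j < cols; j++) {
      // fused add: input + broadcast attention bias + residual
      float x = input[i * cols + j] + attn_bias[j] + residual[i * cols + j];
      added_output[i * cols + j] = x;
      sum1 += x;
      sum2 += x * x;
    }
    float mean = sum1 / cols;
    float var = std::max(sum2 / cols - mean * mean, 0.0f);
    float rstd = 1.0f / std::sqrt(var + eps);
    for (int j = 0; j < cols; j++) {
      output[i * cols + j] =
          (added_output[i * cols + j] - mean) * rstd * gamma[j] + beta[j];
    }
  }
}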
+ checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -297,4 +348,478 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + 
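// [Editor's note] Host-side sketch of the per-row input gradient produced by
// layer_norm_grad_input_kernel / compute_gI above; illustrative only, not part
// of the patch. With xhat = (x - mean) * rstd, the gradient is
//   dX = (rstd / N) * (N * gamma * dY - sum(gamma * dY) - xhat * sum(gamma * dY * xhat))
// and the same value is either assigned to or accumulated into the residual
// gradient, mirroring reset_input_grad / reset_residual_grad in the kernel.
#include <vector>

static void layer_norm_grad_input_ref(std::vector<float> const &dY,
                                      std::vector<float> const &X,
                                      std::vector<float> const &gamma,
                                      float mean, float rstd,
                                      std::vector<float> &dX, int N) {
  float sum_gdy = 0.0f;       // stats_x1 in the kernel
  float sum_gdy_xhat = 0.0f;  // stats_x2 in the kernel
  for (int l = 0; l < N; l++) {
    float xhat = (X[l] - mean) * rstd;
    sum_gdy += gamma[l] * dY[l];
    sum_gdy_xhat += gamma[l] * dY[l] * xhat;
  }
  for (int l = 0; l < N; l++) {
    float xhat = (X[l] - mean) * rstd;
    dX[l] = (rstd / N) * (N * gamma[l] * dY[l] - sum_gdy - xhat * sum_gdy_xhat);
  }
}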
cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 5f05458e34..c83b738a0e 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -85,7 +85,7 @@ AggregateParams Aggregate::get_params() const { AggregateParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -242,7 +242,7 @@ OpMeta *Aggregate::init_task(Task const *task, Runtime *runtime) { Aggregate *agg = (Aggregate *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateMeta *m = new AggregateMeta(handle, agg->n); + AggregateMeta *m = new AggregateMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -603,7 +603,7 @@ bool Aggregate::measure_operator_cost(Simulator *sim, return false; } - AggregateMeta *m = new AggregateMeta(sim->handler, n); + AggregateMeta *m = new AggregateMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index d5ebdb0c22..5a508cfac4 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -281,13 +281,14 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, out_dim); } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(hipMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(hipMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(hipFree(&dev_exp_preds)); checkCUDA(hipFree(&dev_exp_grads)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 38e141b252..9704302092 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -307,9 +307,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(cudaMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(cudaMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(cudaFree(&dev_exp_preds)); diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 1edd430881..6ea3ff3747 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -84,7 +84,7 @@ AggregateSpecParams AggregateSpec::get_params() const { AggregateSpecParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { 
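// [Editor's note] Minimal sketch of the name-copy guard this patch applies to
// each operator's get_params(): the fixed-size params.name buffer is written
// only when the operator name fits within MAX_OPNAME, replacing the old
// `this->name != nullptr` check on an array member that can never be null.
// Not part of the patch; the struct and the MAX_OPNAME value are schematic.
#include <cstring>

struct OpParamsSketch {
  static constexpr int MAX_OPNAME = 128; // assumed buffer size, for illustration
  char name[MAX_OPNAME];
};

static void copy_op_name(OpParamsSketch &params, char const *op_name) {
  if (std::strlen(op_name) < OpParamsSketch::MAX_OPNAME) {
    std::strcpy(params.name, op_name); // fits, including the terminating '\0'
  } else {
    params.name[0] = '\0'; // too long: leave the params name empty
  }
}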
strcpy(params.name, this->name); } return params; @@ -210,7 +210,7 @@ OpMeta *AggregateSpec::init_task(Task const *task, Runtime *runtime) { AggregateSpec *agg = (AggregateSpec *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); + AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -543,7 +543,7 @@ bool AggregateSpec::measure_operator_cost(Simulator *sim, return false; } - AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, n); + AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index 314e20a59c..a676fa81c3 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -290,9 +290,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/aggregate_spec.cu b/src/ops/aggregate_spec.cu index 8d50d45d21..ac5a372efc 100644 --- a/src/ops/aggregate_spec.cu +++ b/src/ops/aggregate_spec.cu @@ -287,9 +287,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 780a77450e..534bac2419 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -112,7 +112,7 @@ ArgTopKParams ArgTopK::get_params() const { params.k = this->k; params.sorted = this->sorted; params.speculative_decoding = this->speculative_decoding; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -387,7 +387,7 @@ InferenceResult DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW probs; - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgTopK::forward_kernel_wrapper( m, input, probs, indices, batch_size, nullptr); @@ -399,7 +399,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 
1892ac2353..4123e50e7e 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -91,7 +91,7 @@ Op *ArgMax::create_operator_from_layer( ArgMaxParams ArgMax::get_params() const { ArgMaxParams params; params.beam_search = this->beam_search; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -314,7 +314,7 @@ FutureMap ArgMax::inference(FFModel const &ff, launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -348,15 +348,18 @@ BeamInferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + float loss = 0.0f; + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); - download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); + copy_tensor_dev_to_host( + parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -383,23 +386,36 @@ InferenceResult return ir; } - GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; - int batch_size = bc->num_active_tokens(); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + int batch_size = bc->num_active_infr_tokens(); + float loss = 0.0f; + + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); + InferenceResult ir; + ir.finetuning_loss = loss; + + if (bc->num_active_peft_tokens() > 0) { + printf("Loss: %.4f\n", loss); + } + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; ArgMax::save_inference_tensors_to_file( - m, shard_id, bc, {}, {}, {input, indices}); + m, shard_id, bc, {input}, {}, {indices}); + } else { + m->decoding_step++; } - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; } @@ -453,4 +469,4 @@ size_t hash::operator()( hash_combine(key, params.beam_search); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b0..60d44cdf2b 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -334,6 +334,21 @@ __device__ void mergeShards(int num_shards, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const 
LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + template __global__ void argmax_forward_kernel(T const *__restrict__ input, size_t shared_memory_size, @@ -381,14 +396,16 @@ __global__ void copy_result(hipcub::KeyValuePair *d_out, /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, hipStream_t stream) { - checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); if (m->beam_search) { @@ -425,28 +442,77 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, k, prob_ptr, indices_ptr); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(hipMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(hipMemcpyAsync( + loss, m->d_loss, sizeof(float), hipMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -454,10 +520,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -465,6 +533,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 
05c84719c1..8a2e2da2d0 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -44,19 +44,35 @@ __global__ void copy_result(cub::KeyValuePair *d_out, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; + if (m->beam_search) { // set all parents id zero in arg top1 case. checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); @@ -73,7 +89,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, m->d_offsets + 1, stream)); - // copy dout to incides + // copy dout to indices int parallelism = batch_size; copy_result<<beam_search); // print_tensor(indices_ptr, 32, "argmax op"); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(cudaMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(cudaMemcpyAsync( + loss, m->d_loss, sizeof(float), cudaMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -104,6 +170,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -111,10 +178,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { 
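// [Editor's note] Host-side sketch of the finetuning loss added above:
// labels are the request's token ids shifted left by one position, each value
// is clamped to a small minimum before the log (the kernel parameter is named
// `logits` but the log is applied directly, so it is treated here as a
// probability), and the summed negative log-likelihood is averaged over the
// backward tokens, as done on the host after the kernel. Illustrative only.
#include <algorithm>
#include <cmath>
#include <vector>

static float sparse_categorical_crossentropy_ref(
    std::vector<float> const &probs, // num_tokens x num_classes, rows sum to 1
    std::vector<int> const &labels,  // already shifted left by one token
    int num_tokens, int num_classes) {
  float const kLogMinValue = 1e-8f;  // mirrors LOG_MIN_VALUE in the kernel
  float loss = 0.0f;
  for (int b = 0; b < num_tokens; b++) {
    float p = std::max(probs[b * num_classes + labels[b]], kLogMinValue);
    loss += -std::log(p);
  }
  return num_tokens > 0 ? loss / num_tokens : 0.0f;
}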
ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -122,6 +191,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); @@ -202,6 +272,10 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); + + // allocate space for loss on device + gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(float)); + d_loss = gpu_mem_allocator.allocate_instance(1); } ArgMaxMeta::~ArgMaxMeta(void) { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 203662d3ec..aef4f0a16a 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -1010,7 +1010,7 @@ MultiHeadAttentionParams MultiHeadAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index ee7f87a7fb..10655a4a1a 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -156,7 +156,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/attention.cu b/src/ops/attention.cu index 18fc810aed..4c460cdbbf 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -194,7 +194,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index e13169f6c1..e5f0611fb0 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -279,7 +279,7 @@ OpMeta *BatchMatmul::init_task(Task const *task, Runtime *runtime) { BatchMatmul const *bmm = (BatchMatmul *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - BatchMatmulMeta *m = new BatchMatmulMeta(handle); + BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm); m->profiling = bmm->profiling; m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; @@ -616,7 +616,7 @@ bool BatchMatmul::measure_operator_cost(Simulator *sim, batch *= sub_input0.dims[i].size; } - BatchMatmulMeta *meta = sim->batch_matmul_meta; + BatchMatmulMeta *meta = new BatchMatmulMeta(sim->handler, this); // allocate tensors in simulator sim->free_all(); diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 7dee6fdaaf..5856f1dddf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -284,7 +284,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index 929ebf81f8..01e993067a 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -270,7 
+270,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 5f4547ace5..36cc7fd8fa 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -375,7 +375,7 @@ BeamInferenceResult // embedding size: eg. 4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; // total token nums - size_t batch_size = bc.num_active_tokens(); + size_t batch_size = bc.num_active_infr_tokens(); // need meta for: how many sub requests in a main request BeamTopK::forward_kernel_wrapper(m, @@ -390,9 +390,11 @@ BeamInferenceResult BeamInferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( + copy_tensor_dev_to_host( + index_ptr, ir.token_ids, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( + value_ptr, ir.probs, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); if (m->inference_debugging) { diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 8545bea7cb..5d80707ea7 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -681,7 +681,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index c24bdf7c74..bf4c23cad0 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -723,7 +723,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 691e45b559..33b862ae85 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -165,7 +165,7 @@ OpMeta *Cache::init_task(Task const *task, Runtime *runtime) { Cache *c = (Cache *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - CacheMeta *m = new CacheMeta(handle); + CacheMeta *m = new CacheMeta(handle, c); m->cache_score = 0.0f; m->profiling = c->profiling; m->inference_debugging = c->inference_debugging; diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 95c5995f9e..a9512c2c59 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -75,7 +75,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cache.cu b/src/ops/cache.cu index a113e57a1c..2f95e59669 100644 --- a/src/ops/cache.cu +++ b/src/ops/cache.cu @@ -74,7 +74,7 @@ float Cache::cache_update(Task const *task, return cache_score; } 
-CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index e514236a31..4a52bf874e 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -190,7 +190,7 @@ OpMeta *Cast::init_task(Task const *task, Runtime *runtime) { Cast *cast = (Cast *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - CastMeta *m = new CastMeta(handler); + CastMeta *m = new CastMeta(handler, cast); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; std::strcpy(m->op_name, cast->name); diff --git a/src/ops/concat.cc b/src/ops/concat.cc index d4d8e525fc..0a82779b6d 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -197,7 +197,7 @@ OpMeta *Concat::init_task(Task const *task, Runtime *runtime) { Concat *cc = (Concat *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - ConcatMeta *m = new ConcatMeta(handler); + ConcatMeta *m = new ConcatMeta(handler, cc); // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; @@ -365,7 +365,7 @@ bool Concat::measure_operator_cost(Simulator *sim, } } - ConcatMeta *m = sim->concat_meta; + ConcatMeta *m = new ConcatMeta(sim->handler, this); init_meta(m, this->legion_axis); sim->free_all(); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 94850a178d..2428c9b99a 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -588,12 +588,13 @@ OpMeta *Conv2D::init_task(Task const *task, // regions[4], task->regions[4], FID_DATA, ctx, runtime, // false/*readOutput*/); - Conv2DMeta *m = new Conv2DMeta(handle); + Conv2DMeta *m = new Conv2DMeta(handle, conv); m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; m->inference_debugging = conv->inference_debugging; - m->trainableInputs[0] = conv->trainableInputs[0]; + m->trainable_inputs[0] = conv->trainable_inputs[0]; + m->reset_input_grads[0] = conv->trainable_inputs[0]; std::strcpy(m->op_name, conv->name); m->layer_guid = conv->layer_guid; @@ -753,7 +754,7 @@ void Conv2D::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[1](I/O): input_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -803,7 +804,7 @@ void Conv2D::backward(FFModel const &ff) { /* region(I): input - region(I/O): input_grad (if trainableInputs[0]) + region(I/O): input_grad (if trainable_inputs[0]) region(I): output region(I/O): output_grad region(I): filter @@ -816,17 +817,17 @@ void Conv2D::backward_task(Task const *task, Runtime *runtime) { // Conv2D* conv = (Conv2D*) task->args; Conv2DMeta const *m = *((Conv2DMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { TensorAccessorW acc_input_grad( regions[rid], 
task->regions[rid], @@ -1119,7 +1120,7 @@ bool Conv2D::measure_operator_cost(Simulator *sim, int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Conv2DMeta *m = sim->conv2d_meta; + Conv2DMeta *m = new Conv2DMeta(sim->handler, this); m->relu = activation == AC_MODE_RELU; // require input_c is divisible by groups diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 4352f459b9..cf8696182b 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -429,7 +429,7 @@ OpMeta *ElementBinary::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { - m->trainableInputs[i] = eb->trainableInputs[i]; + m->trainable_inputs[i] = eb->trainable_inputs[i]; } m->op_type = eb->op_type; m->profiling = eb->profiling; @@ -892,7 +892,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[2](I/O): input0_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -910,7 +910,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[1]->region)); launcher.add_field(rid++, FID_DATA); // regions[4](I/O): input1_grad - if (trainableInputs[1]) { + if (trainable_inputs[1]) { launcher.add_region_requirement( RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, @@ -980,7 +980,7 @@ void ElementBinary::backward_task(Task const *task, in0_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain in0_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); assert(in0_domain == in0_grad_domain); @@ -998,7 +998,7 @@ void ElementBinary::backward_task(Task const *task, in1_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[1]) { + if (m->trainable_inputs[1]) { Domain in1_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); // assert(out_grad_domain == in1_domain); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 0e1d115557..09cf13c717 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -354,7 +354,7 @@ OpMeta *ElementUnary::init_task(Task const *task, Runtime *runtime) { ElementUnary *eu = (ElementUnary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementUnaryMeta *m = new ElementUnaryMeta(handle); + ElementUnaryMeta *m = new ElementUnaryMeta(handle, eu); m->op_type = eu->op_type; m->data_type = eu->outputs[0]->data_type; // Input and output should have the same data type @@ -737,7 +737,7 @@ bool ElementUnary::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } - ElementUnaryMeta *m = sim->ele_unary_meta; + ElementUnaryMeta *m = new ElementUnaryMeta(sim->handler, this); m->op_type = op_type; if (use_cudnn(m->op_type)) { Domain input_domain, output_domain; diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..435abdfe11 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -282,7 +282,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } 
-ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..15e6852388 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -291,7 +291,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b63..95b538bdb6 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -469,7 +469,7 @@ FutureMap Embedding::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(EMBED_FWD_TASK_ID, + IndexLauncher launcher(EMBED_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -559,12 +559,6 @@ void Embedding::forward_task(Task const *task, } forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - Embedding::save_inference_tensors_to_file( - m, shard_id, nullptr, {input}, {kernel}, {output}); - } } /* @@ -672,6 +666,16 @@ void Embedding::backward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +Legion::FutureMap + Embedding::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // nothing to do (backward function only updates weights) + return FutureMap(); +} + void Embedding::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 8c66f9c7bc..3acc68ed9b 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -589,18 +589,7 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, - exp->num_experts, - exp->experts_start_idx, - exp->data_dim, - exp->out_dim, - exp->experts_num_layers, - exp->experts_internal_dim_size, - exp->effective_batch_size, - exp->num_chosen_experts, - exp->alpha, - exp->use_bias, - exp->activation); + ExpertsMeta *m = new ExpertsMeta(handle, exp); m->profiling = exp->profiling; m->inference_debugging = exp->inference_debugging; std::strcpy(m->op_name, exp->name); @@ -682,7 +671,7 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - // int num_active_tokens = bc->num_active_tokens(); + // int num_active_infr_tokens = bc->num_active_infr_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -1075,7 +1064,7 @@ void Experts::inference_task(Task const *task, 
output_ptr, weights_ptr, bias_ptr, - bc->num_active_tokens(), + bc->num_active_infr_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index c06f02a647..502be878a9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -27,7 +27,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -35,25 +35,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, handle_unimplemented_hip_kernel(OP_EXPERTS); } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) {} + ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index ce15cdff55..f6f555d1ad 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -515,7 +515,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -529,8 +529,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - assert(num_active_tokens > 0); - assert(num_active_tokens <= m->effective_batch_size); + assert(num_active_infr_tokens > 0); + assert(num_active_infr_tokens <= m->effective_batch_size); assert(m->effective_batch_size == batch_size); int num_experts_per_block = m->num_experts; @@ -540,7 +540,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; // int num_tokens = m->effective_batch_size; - int num_tokens = num_active_tokens; + int num_tokens = num_active_infr_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); @@ -579,14 +579,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #ifdef INFERENCE_TESTS // Checking // 1. 
check that m->sorted_indices contains indices sorted - int *indices_cpu = download_tensor(indices, num_indices); + int *indices_cpu = copy_tensor_dev_to_host(indices, num_indices); // assert(indices_cpu != nullptr); std::vector indices_vec(indices_cpu, indices_cpu + num_indices); std::vector indices_vec_sorted(indices_vec.size()); std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); - int *thrust_sorted_indices_cpu = download_tensor( + int *thrust_sorted_indices_cpu = copy_tensor_dev_to_host( m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_sorted_indices_cpu != nullptr); std::vector thrust_sorted_indices_vec( @@ -613,7 +613,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); } // 2. check that indices[m->original_indices[i]] = i - int *thrust_original_indices_cpu = download_tensor( + int *thrust_original_indices_cpu = copy_tensor_dev_to_host( m->original_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_original_indices_cpu != nullptr); std::vector thrust_original_indices_vec( @@ -668,8 +668,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index - int *non_zero_expert_labels_cpu = - download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + int *non_zero_expert_labels_cpu = copy_tensor_dev_to_host( + m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -684,8 +684,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, non_zero_experts_check_vec.end())); assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); - int *exp_local_label_to_index = - download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + int *exp_local_label_to_index = copy_tensor_dev_to_host( + m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -699,8 +699,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } // 8. 
Check expert_start_indexes - int *expert_start_indices_thrust = - download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + int *expert_start_indices_thrust = copy_tensor_dev_to_host( + m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -746,9 +746,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *num_assignments_per_expert_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); - assert(download_tensor(m->num_assignments_per_expert, - num_assignments_per_expert_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, + num_assignments_per_expert_thrust, + non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); std::vector num_assignments_per_expert_thrust_vec( num_assignments_per_expert_thrust, @@ -759,9 +759,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *destination_start_indices_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); - assert(download_tensor(m->destination_start_indices, - destination_start_indices_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->destination_start_indices, + destination_start_indices_thrust, + non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); std::vector destination_start_indices_thrust_vec( destination_start_indices_thrust, @@ -1233,25 +1233,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) { +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) { expert_capacity = ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 80aedbbb31..e9f637294a 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -187,7 +187,8 @@ OpMeta *Flat::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - FlatMeta *m = new FlatMeta(handler); + Flat *flat = (Flat *)task->args; + FlatMeta *m = new FlatMeta(handler, flat); return m; } diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9c..121139beb1 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/fused.h" +#include 
"flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -87,12 +88,32 @@ FusedOp::FusedOp(FFModel &model, Op *op) // weights[i]->owner_idx = i; weight_data_types[i] = op->weights[i]->data_type; } - numOutputs = op->numOutputs; - for (int i = 0; i < numOutputs; i++) { - outputs[i] = op->outputs[i]; - outputs[i]->owner_op = this; - outputs[i]->owner_idx = i; - output_data_types[i] = op->outputs[i]->data_type; + numOutputs = 0; + for (int i = 0; i < op->numOutputs; i++) { + bool found = false; + // Handle in-place outputs + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This output is one of the inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[i] = SOURCE_INPUT; + op_input_idx[i] = j; + found = true; + break; + } + } + if (found) { + // do nothing + } else { + outputs[numOutputs] = op->outputs[i]; + output_data_types[numOutputs] = op->outputs[i]->data_type; + op_output_source[i] = SOURCE_OUTPUT; + op_output_idx[i] = numOutputs; + outputs[numOutputs]->owner_op = this; + outputs[numOutputs]->owner_idx = numOutputs; + numOutputs++; + } } numOperators = 1; op_num_inputs[0] = op->numInputs; @@ -109,10 +130,53 @@ FusedOp::FusedOp(FFModel &model, Op *op) op_weight_source[i] = SOURCE_WEIGHT; op_weight_idx[i] = i; } - for (int i = 0; i < numOutputs; i++) { - op_output_source[i] = SOURCE_OUTPUT; - op_output_idx[i] = i; - } + // for (int i = 0; i < numOutputs; i++) { + // op_output_source[i] = SOURCE_OUTPUT; + // op_output_idx[i] = i; + // } +#if 0 + int input_offset = 0, weight_offset = 0, output_offset = 0; + printf("\nNew fused op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif } bool FusedOp::use_same_regions( @@ -165,7 +229,8 @@ bool FusedOp::add_operator( // op->name, op_config)); // Cannot fuse parallel operators (except allreduce) since they have different // paralel_is in forward and backward - assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE || + op->op_type == OP_PARALLEL_IDENTITY); // Currently don't consider nested fusion assert(op->op_type != OP_FUSED); MachineView my_view = outputs[0]->machine_view; @@ -271,6 +336,18 @@ bool FusedOp::add_operator( found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; op_output_idx[output_offset + i] = j; + 
break; + } + } + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This input is one of my inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[output_offset + i] = SOURCE_INPUT; + op_output_idx[output_offset + i] = j; + found = true; + break; } } if (found) { @@ -311,6 +388,50 @@ bool FusedOp::add_operator( "Reach to the #outputs limit during fusion.\n" "Consider increase MAX_NUM_OUTPUTS to allow more fusions.\n"); } + +#if 0 + printf("\nAdd op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif + return true; } @@ -404,9 +525,13 @@ void FusedOp::init_inference(FFModel const &ff, } for (int i = 0; i < op_num_outputs[op]; i++) { int my_off = op_output_idx[i + ooff]; - assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - assert(my_off < batch_outputs.size()); - my_batch_outputs.push_back(batch_outputs[my_off]); + if (op_output_source[i + ooff] == SOURCE_OUTPUT) { + my_batch_outputs.push_back(batch_outputs[my_off]); + } else if (op_output_source[i + ooff] == SOURCE_INPUT) { + my_batch_outputs.push_back(batch_inputs[my_off]); + } else { + assert(false); + } } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; @@ -526,10 +651,6 @@ FutureMap FusedOp::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig - // so we transfer the maximum of them - // size_t batch_config_size = - // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -571,6 +692,83 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { + // printf("operator %i is last SOFTMAX! 
adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); + assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs - 1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +FutureMap FusedOp::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part_grad, + 0 /*projection id*/, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..9f826cd611 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -15,6 +15,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -78,17 +81,27 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; @@ -124,6 +137,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -138,11 +152,6 @@ __host__ void FusedOp::forward_task(Task const *task, } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -163,8 +172,9 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -179,21 
+189,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -209,16 +204,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -229,25 +214,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ 
-375,87 +383,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + 
IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -477,23 +525,127 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + 
my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && "Operator ResidualRMSNorm does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -503,6 +655,33 @@ __host__ void FusedOp::forward_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < 
fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -517,18 +696,525 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { return; } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + 
assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * 
batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + 
assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], 
gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector 
input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -582,11 +1268,6 @@ __host__ void } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -595,8 +1276,10 @@ __host__ void for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { assert(false); @@ -604,11 +1287,14 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + assert(my_off < fused->numOutputs); + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -623,6 +1309,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -638,6 +1339,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta 
*)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -648,27 +1359,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -796,124 +1505,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_RMS_NORM: { + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + case OP_FLAT: { + 
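The OP_LINEAR branch above picks the bias pointer per shard: when add_bias_only_once is set, only the shard at index point 0 passes a non-null bias to the kernel, presumably so a replicated bias is applied a single time across shards. A small sketch of that selection follows; select_bias and the shard loop are illustrative names, not FlexFlow API.

    #include <cstdio>

    // Return the bias pointer only on the shard that should actually add it.
    float const *select_bias(float const *bias, bool add_bias_only_once, int shard_id) {
      if (!add_bias_only_once || shard_id == 0) {
        return bias;
      }
      return nullptr;
    }

    int main() {
      float bias[4] = {0.1f, 0.2f, 0.3f, 0.4f};
      for (int shard = 0; shard < 2; shard++) {
        std::printf("shard %d adds bias: %s\n", shard,
                    select_bias(bias, /*add_bias_only_once=*/true, shard) ? "yes" : "no");
      }
      return 0;
    }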
assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -935,119 +1598,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1176,9 +1743,6 @@ __host__ void FusedOp::backward_task(Task const *task, } } - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int ioff = 0, woff = 0, ooff = 0; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; @@ -1202,6 +1766,7 @@ __host__ void FusedOp::backward_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { my_input_accessor[i] = input_accessor[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { my_input_accessor[i] = output_accessor[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; @@ -1220,9 +1785,9 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; + int my_off = fused->op_output_idx[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599e..cab28181da 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -77,27 +80,32 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + 
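The inference_task defined here returns early when the batch has no active tokens; immediately below, the expected region count includes one extra region when the final fused operator is a Softmax, since that operator also maps a gradient output. A compact sketch of that counting rule, using a placeholder helper name:

    #include <cassert>

    // One region per input, weight and output; plus one for the softmax
    // gradient when the fused block ends with a Softmax operator.
    int expected_region_count(int num_inputs, int num_weights, int num_outputs,
                              bool last_op_is_softmax) {
      return num_inputs + num_weights + num_outputs + (last_op_is_softmax ? 1 : 0);
    }

    int main() {
      assert(expected_region_count(3, 2, 2, false) == 7);
      assert(expected_region_count(3, 2, 2, true) == 8);
      return 0;
    }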
assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -109,8 +117,6 @@ __host__ void FusedOp::forward_task(Task const *task, int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -122,8 +128,6 @@ __host__ void FusedOp::forward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -132,6 +136,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -148,36 +153,39 @@ __host__ void FusedOp::forward_task(Task const *task, int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < 
fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -192,21 +200,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -222,16 +215,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -242,25 +225,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == 
my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -388,88 +394,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - 
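The element-unary branch above dispatches on m->data_type so the same fused path can run half- or float-precision activations. A self-contained sketch of that dispatch pattern follows; the DataType enum, copy_kernel and dispatch are placeholders, and the half branch is omitted since it would need cuda_fp16.h.

    #include <cassert>
    #include <cstddef>

    enum DataType { DT_HALF, DT_FLOAT };   // placeholder, not FlexFlow's enum

    template <typename T>
    void copy_kernel(T const *in, T *out, std::size_t n) {
      // Identity stands in for the real elementwise kernel (relu, gelu, ...).
      for (std::size_t i = 0; i < n; i++) {
        out[i] = in[i];
      }
    }

    void dispatch(DataType dt, void const *in, void *out, std::size_t n) {
      if (dt == DT_FLOAT) {
        copy_kernel(static_cast<float const *>(in), static_cast<float *>(out), n);
      } else {
        assert(false && "data type not handled in this sketch");
      }
    }

    int main() {
      float in[3] = {1.f, 2.f, 3.f}, out[3] = {};
      dispatch(DT_FLOAT, in, out, 3);
      assert(out[2] == 3.f);
      return 0;
    }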
Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -491,39 +536,694 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && 
"Operator ResidualRMSNorm does not support " - "the forward() task"); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); - } - } - ioff += fused->op_num_inputs[op]; + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } + ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { + return; + } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] 
= + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // 
Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + 
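peft_bwd_task walks the fused operators in reverse: ioff, woff and ooff are first advanced over every operator and then decremented before each case, so each operator indexes the same flat input/weight/output slots it used in the forward pass. A standalone sketch of that offset bookkeeping, with a made-up three-operator block:

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical per-operator input counts for a fused block of three ops.
      std::vector<int> op_num_inputs = {1, 2, 1};

      // Accumulate the total offset first ...
      int ioff = 0;
      for (int n : op_num_inputs) {
        ioff += n;
      }
      // ... then peel each operator's slice off the end while walking backwards.
      for (int op = (int)op_num_inputs.size() - 1; op >= 0; op--) {
        ioff -= op_num_inputs[op];
        std::printf("op %d reads flat inputs [%d, %d)\n",
                    op, ioff, ioff + op_num_inputs[op]);
      }
      return 0;
    }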
assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + 
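The attention branches below (like the earlier ones in this file) rely on a fixed weight layout: one fused projection-weight tensor is always present, and a second tensor carrying the biases is mapped only when qkv_bias or final_bias is enabled, which is what the 1 + (int)(*m->qkv_bias || *m->final_bias) assertion encodes. A minimal sketch of that counting rule (the helper name is illustrative):

    #include <cassert>

    int attention_weight_count(bool qkv_bias, bool final_bias) {
      // Projection weights always; one extra tensor holds the biases.
      return 1 + ((qkv_bias || final_bias) ? 1 : 0);
    }

    int main() {
      assert(attention_weight_count(false, false) == 1);
      assert(attention_weight_count(true, false) == 2);
      assert(attention_weight_count(false, true) == 2);
      return 0;
    }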
assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = 
(SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } } /* @@ -531,35 +1231,22 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - // BatchConfig const *bc = (BatchConfig *)task->args; - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // Return if no active tokens - if (bc->num_tokens == 0) { - return; - } - assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], 
@@ -571,8 +1258,6 @@ __host__ void int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -584,8 +1269,6 @@ __host__ void roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -610,20 +1293,15 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { @@ -632,8 +1310,6 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } @@ -641,8 +1317,6 @@ __host__ void int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); assert(my_off < fused->numOutputs); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { @@ -658,6 +1332,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -673,6 +1362,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] 
== 1); assert(fused->op_num_outputs[op] == 1); @@ -683,27 +1382,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -831,126 +1528,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RMS_NORM: { + case OP_FLAT: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - 
my_output_accessor[0], - my_output_accessor[1]); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -972,119 +1621,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1094,37 +1647,6 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); - } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -1156,9 +1678,6 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs + fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - // Domain input_domain[MAX_NUM_INPUTS], input_grad_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS], weight_grad_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS], output_grad_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; @@ -1168,8 +1687,6 @@ __host__ void FusedOp::backward_task(Task const *task, int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -1181,8 +1698,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -1194,8 +1709,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = 
helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], @@ -1206,8 +1719,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - // input_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); input_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], @@ -1219,8 +1730,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - // weight_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], @@ -1233,8 +1742,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - // output_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], @@ -1260,9 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - // Domain my_id[MAX_NUM_INPUTS], my_grad_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS], my_grad_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS], my_grad_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; @@ -1283,19 +1787,11 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; - // my_ip[i] = input_ptr[my_off]; my_input_accessor[i] = input_accessor[my_off]; - // my_grad_id[i] = input_grad_domain[my_off]; - // my_grad_ip[i] = input_grad_ptr[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; - // my_ip[i] = output_ptr[my_off]; my_input_accessor[i] = output_accessor[my_off]; - // my_grad_id[i] = output_grad_domain[my_off]; - // my_grad_ip[i] = output_grad_ptr[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { @@ -1304,11 +1800,7 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - // my_grad_wd[i] = weight_grad_domain[fused->op_weight_idx[i + woff]]; - // my_grad_wp[i] = weight_grad_ptr[fused->op_weight_idx[i + woff]]; my_weight_grad_accessor[i] = weight_grad_accessor[fused->op_weight_idx[i + woff]]; assert(my_weight_grad_accessor[i].domain.get_volume() == @@ -1317,11 +1809,7 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == 
SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; - // my_grad_od[i] = output_grad_domain[my_off]; - // my_grad_op[i] = output_grad_ptr[my_off]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index f2f402737c..03b9a5199b 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -99,7 +99,7 @@ Group_byParams Group_by::get_params() const { Group_byParams params; params.n = this->n; params.alpha = this->alpha; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -271,7 +271,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); + GroupByMeta *m = new GroupByMeta(handle, gb); m->profiling = gb->profiling; m->inference_debugging = gb->inference_debugging; std::strcpy(m->op_name, gb->name); @@ -579,7 +579,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); + GroupByMeta *m = new GroupByMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 761c35f182..9ca6f77898 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -188,9 +188,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(hipMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 0ed09e20b3..43bcb900df 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -198,9 +198,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, } } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(cudaMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index aa60d0f19c..8219cf9e1f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -363,7 +363,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, nullptr /*owner_op*/, - true /*create_grad*/, + model.config.computationMode == COMP_MODE_INFERENCE + ? 
false + : true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { @@ -871,6 +873,139 @@ void IncMultiHeadSelfAttention::inference_task( } } +FutureMap IncMultiHeadSelfAttention::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + if (qkv_bias || final_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_grad_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_grad_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + input_grad, + weight, + output_grad, + biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void IncMultiHeadSelfAttention::backward(FFModel const &ff) { // IncMultiHeadSelfAttention does not support backward assert(false); @@ -926,7 +1061,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f927..826fea4347 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -12,13 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" -#include +#include "hip/hip_complex.h" #include namespace FlexFlow { @@ -27,9 +27,288 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + namespace Kernels { namespace IncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
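+      // (Descriptive note, added for clarity: each group of THREADS_PER_KEY
+      //  threads cooperates on one key/timestep, and the Qk_dot helper invoked
+      //  below combines the group's partial products into a single qk score.)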
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
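+      // (Descriptive note, added for clarity: this is a tree reduction over the
+      //  value groups -- each iteration halves active_groups, the upper half
+      //  spills its partial Out_sum into out_smem, and the lower half accumulates.)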
+ if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} + // only used by MPT model. https://arxiv.org/abs/2108.12409 template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -86,8 +365,10 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, // int qkv_index = i / (num_tokens * qProjSize) % 3; int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * 3; + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; + int qkv_index = in_token_idx / hidden_size; + int proj_size = qkv_index == 0 ? qProjSize : kProjSize; int head_idx = @@ -109,6 +390,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, } } } + template __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, @@ -158,6 +440,10 @@ __global__ void int token_idx = (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + int pos_i = real_i % (proj_size / 2); float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); hipFloatComplex complex_pos = {cos(freq), sin(freq)}; @@ -189,7 +475,7 @@ __global__ void int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * 3 + + token_idx * hidden_size * QKV_WEIGHT_NUM + hidden_size * (q_tensor ? 0 : 1); int complex_part_index = real_part_index + (proj_size / 2); @@ -217,28 +503,59 @@ __global__ void } template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, int num_tokens, - int max_seq_len, int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = hipCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; } } @@ -254,56 +571,68 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_q_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - int lda = k, ldb = k, ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - input_ptr, - hipblas_data_type, - ldb, - &beta, - output_ptr, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // apply rotary emmmbedding for q and k - // step1 change the k, v to complex tensor + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 
0.0f; + // after transpositions + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_infr_tokens(); + int k = m->qSize; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // matrix B: input + // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // apply bias for q, k, v + + // Step 2: apply bias for QKV, or scale the query if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -321,7 +650,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } else if (m->scaling_query) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -333,10 +662,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } + + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -352,14 +683,42 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} + template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -374,6 +733,129 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + hipStream_t stream) { + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + hipblasDatatype_t compute_type = HIPBLAS_R_16F; +#else + hipblasDatatype_t compute_type = cublas_data_type; +#endif + // Project to output, save result directly on output tensor + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->attn_heads); + // matrix B: output + // matrix B's layout: [oProjSize, num_new_tokens] + DT *C = static_cast
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Add final output bias + if (*m->final_bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + num_tokens, + qkv_weight_size, + m->oProjSize); + } +} + +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
(m->devQKVProjArray), \ + static_cast
(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos) + +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_generation_tokens); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, @@ -393,27 +875,29 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, if (m->quantization_type == DT_INT4) { int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int4_attention_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } else { assert(m->quantization_type == DT_INT8); int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int8_attention_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } } else { if (data_type == DT_FLOAT) { @@ -435,7 +919,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -443,19 +927,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, hipStream_t stream) { - // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { checkCUDA(hipMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -465,14 +943,520 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - - // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_generation_tokens > 0) { + // phase 3: Compute attention score for generation tokens + compute_attention_kernel_generation
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } + + if (bc->num_tokens > bc->num_generation_tokens) { + // phase 4: Compute attention score for prompt tokens; + compute_attention_kernel_prompt( + m, bc, shard_id, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); +} + +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + hipStream_t stream) { + assert(!m->offload); + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // 
matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast<DT *>
(m->handle.workSpace); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_bwd), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } } } // namespace IncMultiHeadAttention @@ -481,42 +1465,47 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, using namespace Kernels::IncMultiHeadAttention; template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -530,64 +1519,102 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - // a flag of using this scaling alpha - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests DT *C = static_cast
(m->qk_prods); - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), GET_BLOCKS(parallelism), min((size_t)CUDA_NUM_THREADS, parallelism), 0, @@ -599,13 +1626,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->global_num_q_heads, shard_id); } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. assert(num_new_tokens <= total_tokens); size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; if (entries_above_diagonal > 0) { size_t parallelism = m->num_q_heads * entries_above_diagonal; - hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), GET_BLOCKS(parallelism), min((size_t)CUDA_NUM_THREADS, parallelism), 0, @@ -617,137 +1645,129 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, static_cast
(-INFINITY)); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the HIPDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + hipMemcpyDeviceToDevice, + stream)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast<DT *>
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the generation tokens + DT *C = static_cast<DT *>
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } tokens_previous_requests += num_new_tokens; } - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -813,10 +1833,71 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); - printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); + printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); + } +} + +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); } } @@ -895,7 +1976,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul + assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); @@ -949,14 +2030,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -965,21 +2047,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); break; } default: assert(false && "Unkown inference mode"); } - size_t tokeninfo_size = max_tokens_per_batch; + size_t requestinfo_size = BatchConfig::max_requests_per_batch(); + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -990,7 +2075,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(hipFloatComplex); // more components will // be added here later if (offload) { @@ -1035,10 +2119,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + 
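+  // Note: token_infos and request_infos now point directly into the
+  // handler.batch_config_metadata block (see the removed hipMemcpyAsync of
+  // bc->tokensInfo above and the commented-out per-layer allocations below),
+  // so the batch metadata is presumably copied to the GPU once per batch by
+  // the runtime rather than once per attention layer.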
request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1052,10 +2141,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(hipFloatComplex); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1064,6 +2156,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data @@ -1077,6 +2172,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; checkCUDA(hipStreamSynchronize(stream)); } @@ -1098,4 +2195,37 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a0d31bb6ef..b278611b60 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" @@ -483,6 +481,63 @@ __global__ void } } +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -497,17 +552,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // Step 1: 
Compute QKV projections { @@ -517,7 +573,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize * m->num_q_heads; int m_v = m->vProjSize * m->num_q_heads; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q * QKV_WEIGHT_NUM; // before transpositions @@ -604,7 +660,7 @@ template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; store_kv_cache<< -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -843,6 +899,504 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + assert(!m->offload); + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + cudaDataType_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final 
projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast
(m->handle.workSpace); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>(A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
<DT *>(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } +} + } // namespace IncMultiHeadAttention } // namespace Kernels @@ -877,24 +1431,25 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *bias_ptr, @@ -905,17 +1460,18 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -929,12 +1485,35 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize 
== m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper @@ -1066,6 +1645,25 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, C_softmax)); } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); + } // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { @@ -1090,7 +1688,6 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous // requests (all heads) DT *B = static_cast
(m->qk_prods_softmax); - ; // matrix C: attn heads // matrix C's layout: [vProjSize, num_heads, num_new_tokens] // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous @@ -1136,7 +1733,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1206,6 +1803,70 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } } +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); + } +} + IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, @@ -1424,11 +2085,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); - token_infos = - static_cast(handler.batch_config_metadata); - request_infos = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo)); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); if (offload) { // token_infos = @@ -1478,6 +2138,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 7145af2108..8eeede65c7 100644 --- a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -13,13 +13,15 @@ * limitations under the License. 
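Editorial sketch (not part of the patch): when qk_prod_scaling is enabled, the strided-batched GEMMs in Steps 5 and 6 of the attention peft_bwd_kernel above compute, independently per attention head, dK = dS^T Q / sqrt(d_k) and dQ = dS K / sqrt(d_k), where dS is the masked gradient of the pre-softmax scores held in qk_prods, Q is the cached query activation, and K is the key cache. The following plain C++ reference uses simplified row-major layouts and invented names, only to make the index arithmetic easier to follow:

#include <cmath>
#include <vector>

// One attention head, simplified layouts (illustrative only):
//   dS: [num_new_tokens][total_tokens]  gradient w.r.t. the masked scores Q K^T / sqrt(d_k)
//   Q : [num_new_tokens][d_k]           cached query activations (query_activation_buffer)
//   K : [total_tokens][d_k]             key cache entries for this request
// Produces dQ: [num_new_tokens][d_k] (Step 6) and dK: [total_tokens][d_k] (Step 5).
inline void attention_score_backward_reference(
    std::vector<std::vector<float>> const &dS,
    std::vector<std::vector<float>> const &Q,
    std::vector<std::vector<float>> const &K,
    std::vector<std::vector<float>> &dQ,
    std::vector<std::vector<float>> &dK) {
  size_t const num_new_tokens = dS.size();
  size_t const total_tokens = K.size();
  size_t const d_k = K.empty() ? 0 : K[0].size();
  float const scale = 1.0f / std::sqrt(static_cast<float>(d_k));
  dQ.assign(num_new_tokens, std::vector<float>(d_k, 0.0f));
  dK.assign(total_tokens, std::vector<float>(d_k, 0.0f));
  for (size_t s = 0; s < num_new_tokens; s++) {
    for (size_t t = 0; t < total_tokens; t++) {
      for (size_t d = 0; d < d_k; d++) {
        dQ[s][d] += scale * dS[s][t] * K[t][d]; // Step 6: dQ = dS K / sqrt(d_k)
        dK[t][d] += scale * dS[s][t] * Q[s][d]; // Step 5: dK = dS^T Q / sqrt(d_k)
      }
    }
  }
}

The device code reaches the same result with one cublasGemmStridedBatchedEx call per step, using the head index as the batch dimension and the lda/ldb/strideA/strideB values to walk the interleaved per-head buffers.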
*/ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/batch_matmul.cu b/src/ops/kernels/batch_matmul.cu index ac280db1a4..97f13fa5a8 100644 --- a/src/ops/kernels/batch_matmul.cu +++ b/src/ops/kernels/batch_matmul.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index 16b9b4cec0..1e561959f1 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -14,12 +14,13 @@ */ #include "flexflow/ops/kernels/cast_kernels.h" +#include "flexflow/ops/cast.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/cast_kernels.cu b/src/ops/kernels/cast_kernels.cu index a96f37dbbd..fdce63b9f1 100644 --- a/src/ops/kernels/cast_kernels.cu +++ b/src/ops/kernels/cast_kernels.cu @@ -13,12 +13,13 @@ * limitations under the License. */ +#include "flexflow/ops/cast.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index bf5d46b9cc..6c05e0143c 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/concat_kernels.h" +#include "flexflow/ops/concat.h" #include "flexflow/utils/hip_helper.h" #include @@ -23,6 +24,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/concat_kernels.cu b/src/ops/kernels/concat_kernels.cu index f625560625..2569c36b21 100644 --- a/src/ops/kernels/concat_kernels.cu +++ b/src/ops/kernels/concat_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. 
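Editorial sketch (not part of the patch): the one-line constructor changes in batch_matmul, cast, concat, conv_2d, dropout, and flat all follow the same refactor: each kernel Meta now forwards the operator pointer to the OpMeta base constructor, so shared per-operator flags such as profiling and inference_debugging can be copied once in the base class rather than field by field in every derived Meta. A minimal illustration with simplified stand-in types (not FlexFlow's real classes):

// Simplified stand-ins, not the real FlexFlow classes.
struct DemoHandler {};

struct DemoOp {
  bool profiling = false;
  bool inference_debugging = false;
};

struct DemoOpMeta {
  explicit DemoOpMeta(DemoHandler h) : handle(h) {}
  // New-style constructor: shared flags are copied once here, so derived Metas
  // no longer need to copy them individually.
  DemoOpMeta(DemoHandler h, DemoOp const *op)
      : handle(h), profiling(op->profiling),
        inference_debugging(op->inference_debugging) {}
  DemoHandler handle;
  bool profiling = false;
  bool inference_debugging = false;
};

struct DemoConcatMeta : DemoOpMeta {
  // Mirrors the shape of ConcatMeta::ConcatMeta(FFHandler, Concat const *).
  DemoConcatMeta(DemoHandler h, DemoOp const *cc) : DemoOpMeta(h, cc) {}
};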
*/ +#include "flexflow/ops/concat.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index 7d2fa20c49..85a94ad6be 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/conv_2d.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -326,7 +328,7 @@ void backward_kernel(Conv2DMeta const *m, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn, &alpha, @@ -341,7 +343,7 @@ void backward_kernel(Conv2DMeta const *m, kernel_grad_ptr, m->handle.workSpace, m->handle.workSpaceSize)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn, @@ -352,7 +354,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 6c0fd85496..661acdf732 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -1,9 +1,11 @@ +#include "flexflow/ops/conv_2d.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -309,7 +311,7 @@ void backward_kernel(Conv2DMeta const *m, reluBackward<<>>( output_grad_ptr, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, &alpha, @@ -324,7 +326,7 @@ void backward_kernel(Conv2DMeta const *m, &alpha, m->filterDesc, kernel_grad_ptr)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, @@ -335,7 +337,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, diff --git 
a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index 14225f0bce..c8b1887fd4 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -28,7 +28,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index e142bba83b..d65b951f51 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -27,7 +27,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/flat_kernels.cpp b/src/ops/kernels/flat_kernels.cpp index be48854fc0..6815ce7492 100644 --- a/src/ops/kernels/flat_kernels.cpp +++ b/src/ops/kernels/flat_kernels.cpp @@ -14,11 +14,15 @@ */ #include "flexflow/ops/kernels/flat_kernels.h" +#include "flexflow/ops/flat.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 3836c02c94..fc0c0270c1 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -13,11 +13,15 @@ * limitations under the License. 
*/ +#include "flexflow/ops/flat.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 072eb5e96b..a36d6719c9 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -14,6 +14,8 @@ */ #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,24 +26,53 @@ LinearMeta::LinearMeta(FFHandler handler, Linear const *li, MemoryAllocator gpu_mem_allocator, int weightSize) - : OpMeta(handler, li) { + : OpMeta(handler, li), weight_ptr(nullptr) { + DataType data_type = li->data_type; + // allocate weight and bias in the reserve space for cpu offloading + if (li->offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped( + weightSize * data_type_size(data_type)); + if (li->quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + data_type, li->quantization_type, weightSize); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } + } // Allocate an all-one's vector - float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); - for (int i = 0; i < batch_size; i++) { - dram_one_ptr[i] = 1.0f; + gpu_mem_allocator.create_legion_instance( + reserveInst, data_type_size(data_type) * batch_size); + one_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * batch_size); + int parallelism = batch_size; + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (data_type == DT_FLOAT) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((float *)one_ptr, batch_size); + } else if (data_type == DT_HALF) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((half *)one_ptr, batch_size); } - float *fb_one_ptr; - checkCUDA(hipMalloc(&fb_one_ptr, sizeof(float) * batch_size)); - checkCUDA(hipMemcpy(fb_one_ptr, - dram_one_ptr, - sizeof(float) * batch_size, - hipMemcpyHostToDevice)); - one_ptr = (void *)fb_one_ptr; + // Allocate descriptors checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; +} + +LinearMeta::~LinearMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } -LinearMeta::~LinearMeta(void) {} namespace Kernels { namespace Linear { @@ -96,7 +127,61 @@ void forward_kernel_wrapper(LinearMeta const *m, int batch_size) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + 
checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -126,6 +211,67 @@ void forward_kernel_wrapper(LinearMeta const *m, stream); } + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -134,12 +280,60 @@ void forward_kernel_wrapper(LinearMeta const *m, checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); - // print_tensor(acc_input.ptr, acc_input.rect.volume(), - // "[Linear:forward:input]"); print_tensor(acc_kernel.ptr, - // acc_kernel.rect.volume(), "[Linear:forward:kernel]"); - // print_tensor(acc_bias.ptr, acc_bias.rect.volume(), - // "[Linear:forward:bias]"); print_tensor(acc_output.ptr, - // acc_output.rect.volume(), "[Linear:forward:output]"); + } +} + +void 
peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); } } @@ -223,8 +417,20 @@ Parameter* Linear::get_parameter(int index) } } */ - namespace Internal { + +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -234,20 +440,57 @@ void forward_kernel(LinearMeta const *m, int in_dim, int out_dim, int batch_size, - hipStream_t stream) { + ffStream_t stream) { + // additional processing for uploading weights + if (m->offload) { + // Note that we update weight_ptr when uploading weight + if (m->quantization_type != DT_NONE) { + checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, + weight_ptr, + m->quantized_weightSize, + hipMemcpyHostToDevice, + stream)); + if (m->quantization_type == DT_INT4) { + int parallelism = in_dim * out_dim / 2; + decompress_int4_general_weights
+ <<>>(m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = in_dim * out_dim; + decompress_int8_general_weights
+ <<>>(m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } + + } else { + checkCUDA(hipMemcpyAsync(m->weight_ptr, + weight_ptr, + in_dim * out_dim * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } + } checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); - hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t weight_type = m->offload + ? ff_to_cuda_datatype(m->weight_ptr_type) + : ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use the output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + assert(input_type == weight_type && weight_type == output_type); hipblasDatatype_t compute_type = output_type; -#endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -255,7 +498,7 @@ void forward_kernel(LinearMeta const *m, batch_size, in_dim, &alpha, - weight_ptr, + m->offload ? m->weight_ptr : weight_ptr, weight_type, in_dim, input_ptr, @@ -269,6 +512,16 @@ void forward_kernel(LinearMeta const *m, HIPBLAS_GEMM_DEFAULT)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -306,7 +559,7 @@ void forward_kernel(LinearMeta const *m, GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, - 0, + stream, elements, B, C, @@ -318,6 +571,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast
(output_grad_ptr) + num_infr_only_tokens * out_dim; + hipblasDatatype_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -335,16 +656,11 @@ void backward_kernel(LinearMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; + float sgeam_alpha = 1.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -356,7 +672,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_N, @@ -377,7 +693,27 @@ void backward_kernel(LinearMeta const *m, in_dim, compute_type, HIPBLAS_GEMM_DEFAULT)); - // Compute bias gradiant + if (m->kernel_reg_type == REG_MODE_NONE) { + // do nothing + } else if (m->kernel_reg_type == REG_MODE_L2) { + checkCUDA(hipblasSgeam(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + out_dim, + &sgeam_alpha, + (float *)kernel_grad_ptr, + in_dim, + &(m->kernel_reg_lambda), + (float *)kernel_ptr, + in_dim, + (float *)kernel_grad_ptr, + in_dim)); + } else { + assert(false && "Only L2 regularization is supported"); + } + + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -388,7 +724,7 @@ void backward_kernel(LinearMeta const *m, out_dim, batch_size, &alpha, - m->one_ptr, + static_cast
(m->one_ptr), HIPBLAS_R_32F, 1, output_grad_ptr, @@ -401,7 +737,7 @@ void backward_kernel(LinearMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, @@ -426,7 +762,14 @@ void backward_kernel(LinearMeta const *m, } } +template +__global__ void build_one_ptr(DT *one_ptr, int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size) { + one_ptr[i] = static_cast
(1.0f); + } +} + } // namespace Internal } // namespace Linear } // namespace Kernels -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index c30c9f71c1..d4f930db6c 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -63,6 +63,8 @@ LinearMeta::LinearMeta(FFHandler handler, // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; } LinearMeta::~LinearMeta(void) { @@ -170,6 +172,172 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + 
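Editorial sketch (not part of the patch): the Linear inference_kernel_wrapper above saves the post-activation output of the single request that has peft_bwd set, growing a per-meta scratch buffer only when a larger request arrives, so that peft_bwd_kernel can later evaluate the ReLU/sigmoid derivative from the stored activations instead of rerunning the forward pass. A condensed host-side analogue of that bookkeeping, with illustrative names and std::memcpy standing in for cudaMemcpyAsync:

#include <cstring>
#include <vector>

// Host-side analogue of the lazily grown PEFT activation cache (illustrative only;
// the kernel allocates device memory from handle.peft_activation_allocator).
struct DemoPeftActivationCache {
  std::vector<unsigned char> buffer;
  size_t allocated_size = 0;

  void save(void const *output,
            size_t first_token_offset,
            size_t num_peft_tokens,
            size_t max_peft_tokens,
            size_t out_dim,
            size_t dtype_size) {
    size_t const needed = dtype_size * max_peft_tokens * out_dim;
    if (needed > allocated_size) { // mirrors the allocated_peft_buffer_size check
      buffer.resize(needed);
      allocated_size = needed;
    }
    unsigned char const *src = static_cast<unsigned char const *>(output) +
                               first_token_offset * out_dim * dtype_size;
    // cudaMemcpyAsync(..., cudaMemcpyDeviceToDevice, stream) in the real wrapper.
    std::memcpy(buffer.data(), src, dtype_size * num_peft_tokens * out_dim);
  }
};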
checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] inference time = %.2lfms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -323,17 +491,7 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -398,7 +556,7 @@ void forward_kernel(LinearMeta const *m, size_t elements = (size_t)out_dim * (size_t)batch_size; constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - gelu_forward_kernel<<>>( + gelu_forward_kernel<<>>( elements, B, C, (float *)output_ptr); } else if (m->activation == AC_MODE_NONE) { // Do nothing @@ -407,6 +565,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update 
input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast
(output_grad_ptr) + num_infr_only_tokens * out_dim; + cudaDataType_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -428,17 +654,7 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -450,7 +666,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -491,7 +707,7 @@ void backward_kernel(LinearMeta const *m, assert(false && "Only L2 regularization is supported"); } - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -515,7 +731,7 @@ void backward_kernel(LinearMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp new file mode 100644 index 0000000000..c3c2cce3cf --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -0,0 +1,576 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
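Editorial sketch (not part of the patch): the LoraLinear kernels introduced in this new file apply the standard low-rank adapter update, out += (lora_alpha / rank) * B (A x), which is what the two GEMMs with scaling_constant in inference_kernel compute (the first builds the rank-sized intermediate in low_rank_activation or the workspace, the second accumulates into the frozen layer's output with beta = alpha). A scalar reference with simplified row-major layouts and illustrative names:

#include <vector>

// Per-token LoRA update: out += (lora_alpha / rank) * B * (A * x).
// Layouts are simplified to row-major in this sketch:
//   A (w0): [rank][in_dim], B (w1): [out_dim][rank], x: [in_dim],
//   out: [out_dim] and already holds the frozen layer's W * x.
inline void lora_apply_reference(std::vector<float> const &x,
                                 std::vector<float> const &A,
                                 std::vector<float> const &B,
                                 std::vector<float> &out,
                                 int in_dim,
                                 int out_dim,
                                 int rank,
                                 float lora_alpha) {
  float const scaling = lora_alpha / static_cast<float>(rank);
  std::vector<float> low_rank(rank, 0.0f); // analogue of low_rank_activation
  for (int r = 0; r < rank; r++) {
    for (int i = 0; i < in_dim; i++) {
      low_rank[r] += A[r * in_dim + i] * x[i]; // first GEMM: A x
    }
  }
  for (int o = 0; o < out_dim; o++) {
    for (int r = 0; r < rank; r++) {
      out[o] += scaling * B[o * rank + r] * low_rank[r]; // second GEMM, in-place accumulate
    }
  }
}

The backward kernel later reuses low_rank_activation and the cached input activation to form the gradients of B, A, and the input with the same scaling factor.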
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + 
Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, hipStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->input_type[1]); + hipblasDatatype_t lr_actv_type = output_type; + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(hipMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t lr_actv_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if 
(bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast
<DT *>(weight.w0_v_values_ptr), + static_cast
<DT *>(weight.w0_ptr)); + // LoRA_B weight is replicated w tensor parallelism, so we need to sync + // and sum first + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu new file mode 100644 index 0000000000..5f130782aa --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -0,0 +1,579 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/cuda_helper.h" +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast
<DT *>(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t lr_actv_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) 
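+      // Shape sketch for the GEMM inside this branch (a reading of the call
+      // below, not new logic; matrices are column-major):
+      //   dW_B[rank, out_dim] = (lora_alpha / rank)
+      //       * low_rank_activation[rank, num_peft_tokens]
+      //       * output_grad[out_dim, num_peft_tokens]^T,
+      // with beta = 0.0 resetting and beta = 1.0 accumulating the gradient.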
{ + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast
<DT *>(weight.w0_v_values_ptr), + static_cast
<DT *>(weight.w0_ptr)); + // LoRA_B weight is replicated w tensor parallelism, so we need to sync + // and sum first +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index 8af85612ca..b3f20a35dd 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/hip_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/pool_2d_kernels.cu b/src/ops/kernels/pool_2d_kernels.cu index b418d20cd3..c236f049ba 100644 --- a/src/ops/kernels/pool_2d_kernels.cu +++ b/src/ops/kernels/pool_2d_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/reshape_kernels.cpp b/src/ops/kernels/reshape_kernels.cpp index b17d95bfea..47f407fd82 100644 --- a/src/ops/kernels/reshape_kernels.cpp +++ b/src/ops/kernels/reshape_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/reshape_kernels.cu b/src/ops/kernels/reshape_kernels.cu index 9786f63815..0a2b01ae52 100644 --- a/src/ops/kernels/reshape_kernels.cu +++ b/src/ops/kernels/reshape_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 6906556452..016364edfd 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -22,18 +22,16 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; + #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual 
= rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -47,12 +45,14 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace ResidualRMSNorm { @@ -78,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -128,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -144,19 +138,10 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *residual_output_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualRMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -178,7 +163,57 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, 
stream)); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -211,6 +246,67 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, assert(false && "Unsupported data type"); } + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * 
num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -222,6 +318,288 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
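+// For reference (a reading of the kernel below, not new logic): with
+// rrms[i] = 1 / sqrt(mean_j(X[i][j]^2) + eps) cached from the forward pass,
+// the weight gradient reduces each column j over the batch dimension,
+//   dg[j] = sum_i dY[i][j] * X[i][j] * rrms[i],
+// so one thread handles one column and loops serially over the M rows.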
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); 
+ assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..0d44f0260a 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -24,17 +24,14 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual = rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -48,6 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -80,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -89,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -111,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -130,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -147,26 +139,17 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -219,6 +202,401 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported 
data type"); + } + + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + 
static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) 
{ + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e6..4158628005 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -23,16 +23,12 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -47,12 +43,14 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace RMSNorm { @@ -78,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -107,16 +103,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -124,10 +115,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -138,19 +128,10 @@ void forward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -204,6 +185,363 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if 
(activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + 
stream, + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..dd6ada864d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -48,6 +44,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -96,66 +93,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? 
shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { - __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; - float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; - sum += (static_cast(X[index]) * static_cast(X[index])); - } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } -} - -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; - } -} -#endif - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -167,16 +104,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -184,10 +116,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -199,24 +130,15 @@ void forward_kernel(RMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - RMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, @@ -261,6 +183,346 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + 
cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void 
RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) 
{ + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + RMSNormBackwardCUDAKernel + <<>>( + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..fa31c5adff 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -25,13 +25,13 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; @@ -41,20 +41,26 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), 
output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -70,11 +76,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -84,8 +88,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -101,21 +119,112 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + hipMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); 
print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} namespace Internal { template @@ -138,7 +247,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, hipStream_t stream) { @@ -149,6 +259,116 @@ void backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + hipStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < 
bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(hipMemsetAsync(input_grad_ptr + + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sparse_categorical_crossentropy_loss_peft_backward
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..16f1219bf6 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -24,7 +24,7 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); @@ -40,10 +40,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -52,7 +51,15 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -68,11 +75,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -82,8 +87,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -99,21 +118,113 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - 
half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + namespace Internal { template void forward_kernel(SoftmaxMeta const *m, @@ -135,7 +246,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { @@ -146,6 +258,115 @@ void 
backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + cudaStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(cudaMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + sparse_categorical_crossentropy_loss_peft_backward<<< + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream>>>( + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/transpose_kernels.cpp b/src/ops/kernels/transpose_kernels.cpp index 49a7d827f5..199e1cd0c1 100644 --- a/src/ops/kernels/transpose_kernels.cpp +++ b/src/ops/kernels/transpose_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/hip_helper.h" #include @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; 
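For reference, the softmax peft_bwd path above implements the gradient of sparse categorical cross-entropy taken directly on the softmax probabilities: for each backward token, dL/dz_c = p_c - 1[c == label], with labels shifted left by one position and the row scaled by 1/(num_tokens - 1), while the last token of each request (which has no next-token label) gets a zeroed gradient row. A minimal host-side sketch of the same computation, written against plain arrays rather than the FlexFlow accessors and device kernels (function name and arguments are illustrative only), is:

#include <vector>

// Illustrative reference only: probs holds the softmax outputs for num_tokens
// tokens (row-major, num_classes entries per token); labels holds the token
// ids of the request. Gradients are produced for the first num_tokens - 1
// tokens; the last token has no next-token label.
std::vector<float> peft_softmax_grad_reference(std::vector<float> const &probs,
                                               std::vector<int> const &labels,
                                               int num_tokens,
                                               int num_classes) {
  std::vector<float> grad(probs.size(), 0.0f);
  int const num_bwd_tokens = num_tokens - 1;
  float const scale = 1.0f / num_bwd_tokens;
  for (int t = 0; t < num_bwd_tokens; t++) {
    int const target = labels[t + 1]; // shift labels left by one position
    for (int c = 0; c < num_classes; c++) {
      float const g = probs[t * num_classes + c] - (c == target ? 1.0f : 0.0f);
      grad[t * num_classes + c] = g * scale;
    }
  }
  return grad; // the last token's row stays zero, matching the memset in the kernels
}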
+TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/kernels/transpose_kernels.cu b/src/ops/kernels/transpose_kernels.cu index b401ff0ba1..18a6e405af 100644 --- a/src/ops/kernels/transpose_kernels.cu +++ b/src/ops/kernels/transpose_kernels.cu @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -21,6 +22,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b19f400eb2..3161987d60 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" @@ -56,7 +57,7 @@ LayerNormParams LayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -561,7 +562,7 @@ void LayerNorm::inference_task(Task const *task, assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + LayerNorm::inference_kernel_wrapper(m, bc, in, out, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -645,6 +646,104 @@ void LayerNorm::forward_task(Task const *task, LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } +Legion::FutureMap + LayerNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + if (elementwise_affine) { + // regions[2](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): gamma +*/ +void LayerNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + + Domain out_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain in_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 3)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(gamma_domain.get_volume() == m->effective_num_elements); + } else { + assert(regions.size() == 2); + } + LayerNorm::peft_bwd_kernel_wrapper(m, output_grad, input_grad, gamma); +} + void LayerNorm::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -722,55 +821,60 @@ void LayerNorm::backward_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + gamma_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain gamma_grad_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); if (m->use_bias) { Domain beta_grad_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + beta_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); assert(gamma_domain == beta_grad_domain); } - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); } - - LayerNorm::backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + LayerNorm::backward_kernel_wrapper( + m, output_grad, input, input_grad, gamma, gamma_grad, beta_grad); } bool LayerNorm::measure_operator_cost(Simulator *sim, @@ -785,7 +889,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = sim->layernorm_meta; + MemoryAllocator gpu_mem_allocator(sim->memory); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this, gpu_mem_allocator); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); @@ -821,16 +926,24 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (sim->computationMode == COMP_MODE_TRAINING) { float *in_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW in_grad_acc( + inputs[0]->data_type, input_domain, in_grad_ptr); assert(in_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_grad_ptr = NULL; out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR out_grad_acc( + outputs[0]->data_type, output_domain, out_grad_ptr); 
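The LayerNorm backward task above hands the output gradient, saved input, gamma, and the gradient accessors to backward_kernel_wrapper; the CUDA/HIP kernels further below reduce each row to two sums (ds = sum(dy * gamma * x), db = sum(dy * gamma)) and fold them into per-row scale/bias terms. The sketch below is a single-row host reference that is mathematically consistent with that fused formulation; the plain std::vector signature is hypothetical and not a FlexFlow API.

#include <cstddef>
#include <vector>

// Illustrative single-row reference for LayerNorm backward (one device block
// handles one row of N elements). Inputs: upstream gradient dy, saved input x,
// gamma, and the per-row mean/rstd from the forward pass. Outputs: dx for this
// row; dgamma/dbeta are accumulated across rows.
void layer_norm_backward_row(std::vector<float> const &dy,
                             std::vector<float> const &x,
                             std::vector<float> const &gamma,
                             float mean,
                             float rstd,
                             std::vector<float> &dx,
                             std::vector<float> &dgamma,
                             std::vector<float> &dbeta) {
  std::size_t const N = x.size();
  // Per-row sums (what ComputeInternalGradientsCUDAKernel reduces on-device).
  float ds = 0.0f, db = 0.0f;
  for (std::size_t j = 0; j < N; j++) {
    ds += dy[j] * gamma[j] * x[j];
    db += dy[j] * gamma[j];
  }
  // Fused per-row scalars (the scale/bias terms of
  // ComputeGradientFusedParamsCUDAKernel).
  float const s = 1.0f / static_cast<float>(N);
  float const b = (db * mean - ds) * rstd * rstd * rstd * s;
  float const c = -b * mean - db * rstd * s;
  for (std::size_t j = 0; j < N; j++) {
    float const xhat = (x[j] - mean) * rstd;
    dx[j] = rstd * dy[j] * gamma[j] + b * x[j] + c;
    dgamma[j] += dy[j] * xhat; // reduced over the batch dimension
    dbeta[j] += dy[j];
  }
}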
assert(out_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorW gamma_grad_acc( + outputs[0]->data_type, output_domain, gamma_grad_ptr); + GenericTensorAccessorW beta_grad_acc( + outputs[0]->data_type, output_domain, beta_grad_ptr); out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && @@ -842,13 +955,13 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } backward = [=] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + backward_kernel_wrapper(m, + out_grad_acc, + input1_acc, + in_grad_acc, + gamma_acc, + gamma_grad_acc, + beta_grad_acc); }; } diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..27d314e21e 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -27,21 +28,37 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; - use_bias = ln->use_bias; + profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + ds_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + db_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + scale_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } -LayerNormMeta::~LayerNormMeta(void) {} +LayerNormMeta::~LayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} template __device__ __forceinline__ T WARP_SHFL_DOWN(T value, @@ -74,7 +91,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -82,8 +99,14 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { +__global__ void LayerNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; @@ -103,18 +126,10 @@ __global__ void RowwiseMomentsCUDAKernel( mean[i] = static_cast(sum1); rstd[i] = static_cast(rsqrt(sum2 + eps)); } -} -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { + __syncthreads(); + using T_ACC = T; - const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = @@ -135,28 +150,19 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), - m->effective_batch_size, - kCUDABlockReduceNumThreads, - 0, - stream, - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), - m->effective_batch_size, - kCUDANumThreads, - 0, - stream, - m->effective_num_elements, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + + LayerNormFusedForwardKernel + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -167,24 +173,154 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() - : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } } template @@ -224,7 +360,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -235,27 +371,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -452,116 +567,148 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, hipStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), - M, - kCUDABlockReduceNumThreads, - 0, - stream, - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); - + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); - hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), - blocks, - num_threads, - nshared, - stream, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } else { const int64_t B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; - hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), - B, - dim3(kThreadX, kThreadY), - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); + } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + 
LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..0801d11617 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -50,6 +50,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } LayerNormMeta::~LayerNormMeta(void) { @@ -96,73 +97,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); - } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); - if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); - sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); - } -} - -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - const T_ACC beta_v = - beta == nullptr ? 
T_ACC(0) : static_cast(beta[j]); - Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; - } -} -#endif - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -177,18 +111,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -200,7 +129,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -221,25 +150,18 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -290,6 +212,116 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, } } +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == 
PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { @@ -327,7 +359,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -338,27 +370,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -620,44 +631,83 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); } - // }else if(m->output_type[0] == DT_HALF){ - // LayerNorm::backward_kernel(m, - // output_grad_ptr, - // input_ptr, - // input_grad_ptr, - // gamma_ptr, - // gamma_grad_ptr, - // beta_grad_ptr, - // stream); - // } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 44b56d623e..20ad762b62 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -498,7 +498,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, 
m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->inference_debugging = linear->inference_debugging; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; @@ -632,8 +632,11 @@ void Linear::inference_task(Task const *task, m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + assert((weight.domain.hi()[0] - weight.domain.lo()[0] + 1) == in_dim); + assert((weight.domain.hi()[1] - weight.domain.lo()[1] + 1) == out_dim); + assert(weight.domain.get_volume() == in_dim * out_dim); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorR bias; if (m->use_bias && !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { @@ -645,14 +648,15 @@ void Linear::inference_task(Task const *task, runtime); assert(bias.domain.get_volume() == static_cast(out_dim)); } - forward_kernel_wrapper(m, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + inference_kernel_wrapper(m, + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -664,6 +668,119 @@ void Linear::inference_task(Task const *task, } Linear::save_inference_tensors_to_file( m, shard_id, bc, {input}, weights_accessors, {output}); + printf("\tin=[%i,%i].T @ w=[%i,%i] -> out=[%i,%i]\n", + in_dim, + bc->num_tokens, + in_dim, + out_dim, + out_dim, + bc->num_tokens); + } +} + +FutureMap Linear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Linear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 3); + assert(task->regions.size() == 3); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false, true); + printf("\tw=[%i,%i] @ out_grad=[%i,%i] -> in_grad[%i,%i]\n", + in_dim, + out_dim, + out_dim, + num_peft_tokens, + in_dim, + num_peft_tokens); + } + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight.ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); } } @@ -782,7 +899,7 @@ void Linear::backward(FFModel const &ff) { launcher.add_field(rid++, FID_DATA); // regions[1](I/O): replica_grad assert(replica == NULL); - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -878,17 +995,17 @@ void Linear::backward_task_with_dim(Task const *task, Runtime *runtime) { // Linear* linear = (Linear*) task->args; LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); DT *input_grad = nullptr; size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { @@ -1119,7 +1236,10 @@ bool Linear::measure_operator_cost(Simulator *sim, int input_n = sub_input.get_volume() / input_c; int output_c = sub_output.dims[0].size; int output_n = sub_output.get_volume() / 
output_c; - LinearMeta *m = sim->linear_meta; + + MemoryAllocator gpu_mem_allocator(sim->memory); + LinearMeta *m = new LinearMeta( + sim->handler, output_n, this, gpu_mem_allocator, input_c * output_c); m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; @@ -1164,7 +1284,7 @@ bool Linear::measure_operator_cost(Simulator *sim, }; if (sim->computationMode == COMP_MODE_TRAINING) { void *input_grad_ptr = NULL; - if (trainableInputs[0]) { + if (trainable_inputs[0]) { input_grad_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); } else { @@ -1313,7 +1433,7 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc new file mode 100644 index 0000000000..fde6bc2b28 --- /dev/null +++ b/src/ops/lora_linear.cc @@ -0,0 +1,1316 @@ +#include "flexflow/ops/lora_linear.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/layer.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include "legion/legion_utilities.h" +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::LoraLinear; + +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { + return true; + } + } + return false; +} + +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; + + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer 
= base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } + } + } + + // save finetuned lora model configs to file + if (peft_config.trainable) { + std::string finetuned_model_folder = join_path({ + peft_config.cache_folder, + "finetuned_models", + peft_config.peft_model_id, + }); + fs::remove_all(finetuned_model_folder); + std::string finetuned_model_config_folder = join_path({ + finetuned_model_folder, + "config", + }); + fs::create_directories(finetuned_model_config_folder); + std::string lora_linear_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_config.json", + }); + serialize_to_json_file(peft_config, lora_linear_config_filepath); + std::string optimizer_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_optimizer_config.json", + }); + if (typeid(*peft_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*sgd_config, optimizer_config_filepath); + } else if (typeid(*peft_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + LoraAdamOptimizerConfig const *adam_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*adam_config, optimizer_config_filepath); + } else { + assert(false && "Optimizer not supported"); + } + } + + return peft_model_id; +} + +Op *LoraLinear::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + std::unordered_map _peft_configs; + std::vector const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); + } + return new LoraLinear(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + _peft_configs, + layer->name); +} + +LoraLinear::LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, 
+ ParallelTensor const output) + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} + +LoraLinear::LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name) + : LoraLinear(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + params.peft_configs, + params.name) {} + +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) + : Op(model, + _op_type, + _output->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + false, + 1 /*outputs*/, + _input, + _output) { + assert(_input->data_type == _output->data_type); + // overwrite layer_guid + layer_guid = _layer_guid; + data_type = _output->data_type; + + ParallelTensorShape input_shape = this->inputs[0]->get_shape(); + LoraLinearParams params = this->get_params(); + + // Create output tensor + { + int numdim = inputs[1]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[1]->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, inputs[1]->data_type, this); + } + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); + } + // assert(check_output_input_weight_parallel_dims(allocate_weights)); +} + +void LoraLinear::init(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +void LoraLinear::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LoraLinear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, output_tensor); +} + +template +void load_peft_from_file(DT *ptr, + size_t num_rows, + size_t num_columns, + int num_shards, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + + // HuggingFace dims (serialized in row-major order) + // lora_A: [rank, intermediate_dim] + // lora_B: [hidden_dim, rank] + // FlexFlow dims (serialized in column-major order) + // lora_A: [intermediate_dim, rank] + // lora_B: [rank, out_dim] + // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B + assert(num_rows % num_shards == 0); + size_t chunk_size = num_rows / num_shards; + size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; + + // Allocate memory for the weight shard + std::vector
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +/* + regions[0](O): output + regions[1](I): kernel + regions[2](I): bias +*/ +OpMeta *LoraLinear::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinear const *lora = (LoraLinear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(lora->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorRW(lora->outputs[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int batch_size = output.domain.get_volume() / out_dim; + assert(input.domain.get_volume() == in_dim * batch_size); + assert(output.domain.get_volume() == out_dim * batch_size); + + LoraLinearMeta *m = new LoraLinearMeta(handle, lora); + m->trainable_inputs[0] = lora->trainable_inputs[0]; + std::strcpy(m->op_name, lora->name); + m->layer_guid = lora->layer_guid; + + int num_shards = lora->inputs[0]->dims[0].degree; + int shard_id = task->index_point.point_data[0]; + int num_dims = lora->inputs[0]->num_dims; + assert(in_dim == lora->inputs[0]->dims[0].size / num_shards); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); + + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[0]); + assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); + + // get layer name + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + for (auto const &kv : lora->peft_configs) { + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. 
+ int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = rank; + int lora_B_num_rows = rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + weight.num_shards = num_shards; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + + if (!lora_config.init_lora_weights) { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } else { + // initialize weights + int seed = 0; + init_kernel_wrapper(m, seed); + } + + // allocate space for gradients if the LoRA layer is trainable + if (lora_config.trainable) { + // Ensure we have an optimizer + assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); + assert(typeid(*lora_config.optimizer_config) != + typeid(LoraOptimizerConfig) && + "Optimizer config is not a subclass of LoraOptimizerConfig"); + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, 
w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + // allocate space for v_values if needed by optimizer + if (typeid(*lora_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + lora_config.optimizer_config); + if (sgd_config->momentum > 0.0f) { + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } else { + weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + } + } else if (typeid(*lora_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optim not yet implemented"); + } else { + assert(false && "Optimizer not supported"); + } + } + assert(m->model_state.find(model_id) == m->model_state.end()); + m->model_state[model_id].weights = weight; + m->model_state[model_id].optimizer_config = lora_config.optimizer_config; + m->model_state[model_id].lora_alpha = lora_config.lora_alpha; + m->model_state[model_id].cache_folder = lora_config.cache_folder; + m->model_state[model_id].peft_model_id = lora_config.peft_model_id; + } + return m; +} + +void LoraLinear::forward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +FutureMap + LoraLinear::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, bc, input, output); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "INF " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + std::string filename = dst_filepath.string() + ".input_0"; + if (input.data_type == DT_FLOAT) { + save_tensor( + input.get_float_ptr(), input.domain.get_volume(), filename.c_str()); + } else if (input.data_type == DT_HALF) { + save_tensor( + input.get_half_ptr(), input.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + int rank, num_tokens; + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + rank = weight.rank; + num_tokens = input.domain.get_volume() / weight.in_dim; + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id) / layername; + std::string filenameA = + dst_filepath_weights.string() + ".weight_A.original"; + std::string filenameB = + dst_filepath_weights.string() + ".weight_B.original"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + filename = dst_filepath.string() + ".output_0"; + if (output.data_type == DT_FLOAT) { + save_tensor( + output.get_float_ptr(), output.domain.get_volume(), filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor( + output.get_half_ptr(), output.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + if (bc->num_active_peft_tokens() > 0) { + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } + m->decoding_step++; + } +} + +FutureMap LoraLinear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + set_argumentmap_for_inference(ff, argmap, output_tensor); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void lora_inference_debugging(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW input_grad, + GenericTensorAccessorR output_grad, + int shard_id) { + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "BWD " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + // weights, weights gradients + fs::path dst_filepath_weights = + get_dst_folder("weights", m->bwd_step, shard_id) / layername; + assert(m->model_state.size() >= 1 && "Model state empty!"); + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + std::string filename_weight_A = + dst_filepath_weights.string() + ".weight_A.finetuned"; + std::string filename_weight_B = + dst_filepath_weights.string() + ".weight_B.finetuned"; + std::string filename_grad_A = + dst_filepath_weights.string() + ".weight_A.gradient"; + std::string filename_grad_B = + dst_filepath_weights.string() + ".weight_B.gradient"; + if (m->input_type[0] == DT_FLOAT) { + // weight A + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((float *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((float *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else if (m->input_type[0] == DT_HALF) { + // weight A + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((half *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((half *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + std::string filename = dst_filepath.string() + ".input_gradient_0"; + if (input_grad.data_type == DT_FLOAT) { + save_tensor(input_grad.get_float_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else if (input_grad.data_type == DT_HALF) { + save_tensor(input_grad.get_half_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + + filename = dst_filepath.string() + ".output_gradient_0"; + if (output_grad.data_type == DT_FLOAT) { + save_tensor(output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else if (output_grad.data_type == DT_HALF) { + save_tensor(output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + m->bwd_step++; +} + +template +void save_peft_to_file(DT const *weight_ptr, + size_t size, + std::string filepath) { + std::ofstream out(filepath, std::ios::binary); + // Check if the file was opened successfully + if (!out || !out.is_open() || !out.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(out && out.is_open() && out.good() && + "can't write to lora weight file path"); + std::vector
host_array(size); + copy_tensor_dev_to_host(weight_ptr, host_array.data(), size); + + size_t target_data_size = sizeof(DT) * size; + out.write((char *)host_array.data(), target_data_size); + + size_t out_written_size = out.tellp(); + if (out_written_size != target_data_size) { + printf("save weight data error: %lu, %lu, %lu\n", + out_written_size, + target_data_size, + sizeof(DT)); + assert(false); + } + out.close(); +} + +void save_peft_weights_if_needed(LoraLinearMeta *m, + BatchConfig const *bc, + int in_dim, + int out_dim, + int shard_id) { + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + std::string weight_export_folder = join_path({ + m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + "finetuned_models", + m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + "weights", + "shard_" + std::to_string(shard_id), + }); + fs::create_directories(weight_export_folder); + + int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + std::string w0_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_B.weight"}); + if (m->input_type[0] == DT_FLOAT) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else if (m->input_type[0] == DT_HALF) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else { + assert(false && "Data type not supported"); + } + } + } +} + +void LoraLinear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + + save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); + + if (m->inference_debugging) { + lora_inference_debugging(m, bc, input_grad, output_grad, shard_id); + } +} + +void LoraLinear::backward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal backward"); +} + +void LoraLinear::print_layer(FFModel const &ff) {} + +void LoraLinear::map_output_tensors(FFModel &ff) { + assert(numOutputs == 1); + assert(numInputs == 2); + assert(outputs[0]->get_volume() == inputs[1]->get_volume()); + outputs[0]->parallel_is = inputs[1]->parallel_is; + outputs[0]->region = inputs[1]->region; + outputs[0]->part = inputs[1]->part; + outputs[0]->region_grad = inputs[1]->region_grad; + outputs[0]->part_grad = inputs[1]->part_grad; +} + +bool LoraLinear::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { + auto it = rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { + return false; + } + } + return true; + } + return false; +} + +fs::path create_unique_temp_directory() { + std::srand(static_cast(std::time(nullptr))); + + fs::path temp_dir = fs::temp_directory_path(); + fs::path unique_path; + + do { + std::string unique_name = "flexflow_tmp_" + std::to_string(std::rand()); + unique_path = temp_dir / unique_name; + } while (fs::exists(unique_path)); + + fs::create_directory(unique_path); + return unique_path; +} + +void serialize_string(Legion::Serializer &sez, + std::string string_to_serialize) { + sez.serialize(string_to_serialize.length()); + sez.serialize(string_to_serialize.c_str(), string_to_serialize.length()); +} + +std::string deserialize_string(Legion::Deserializer &dez) { + size_t string_size; + char buffer[4096] = {0}; + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + return std::string(buffer); +} + +void LoraLinear::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + sez.serialize(kv.first.id); + + // Serialize LoraLinearConfig and OptimizerConfig to tmp folder + // 1. Create tmp dir and serialize it + fs::path unique_temp_dir = create_unique_temp_directory(); + serialize_string(sez, unique_temp_dir.string()); + // 2. 
Dump LoraLinearConfig to json file in tmp dir + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + serialize_to_json_file(kv.second, lora_config_json_filepath); + // 3. Dump optimizer to json file in tmp dir, and serialize optimizer type + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); + if (kv.second.trainable) { + if (typeid(*kv.second.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_SGD); + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*sgd_config, optim_config_filepath); + } else if (typeid(*kv.second.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_ADAM); + LoraAdamOptimizerConfig const *adam_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*adam_config, optim_config_filepath); + } else { + assert(false && "Optimizer type not yet supported"); + } + } + } + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +/* static */ +using PCG::Node; +Node LoraLinear::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + size_t id, transformer_layer_id, deserialized_model_id; + OperatorType op_type; + size_t num_pefts; + size_t name_len; + char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i = 0; i < num_pefts; i++) { + // Deserialize PEFTModelID + size_t pid; + dez.deserialize(pid); + PEFTModelID peft_model_id(pid); + // Deserialize tmp folder containing LoraLinearConfig and optimizer config + fs::path unique_temp_dir = fs::path(deserialize_string(dez)); + // 1. Deserialize LoraLinearConfig + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + std::unique_ptr lora_linear_config = + deserialize_from_json_file(lora_config_json_filepath); + // 2. 
Deserialize optimizer if needed + if (lora_linear_config->trainable) { + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + OptimizerType type_; + dez.deserialize(type_); + if (type_ == OPTIMIZER_TYPE_SGD) { + std::unique_ptr sgd_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast(sgd_optimizer_config.release()); + } else if (type_ == OPTIMIZER_TYPE_ADAM) { + std::unique_ptr adam_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast( + adam_optimizer_config.release()); + } else { + printf("Optimizer type: %d\n", type_); + assert(false && "Optimizer type not yet supported"); + } + } + try { + fs::remove_all(unique_temp_dir); + } catch (fs::filesystem_error const &e) { + std::cerr << "Error removing tmp directory: " << e.what() << std::endl; + } + params.peft_configs.emplace( + std::make_pair(peft_model_id, *lora_linear_config)); + } + dez.deserialize(name_len); + dez.deserialize(name, name_len); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + + params.layer_guid = layer_guid; + params.type = op_type; + strcpy(params.name, name); + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +Op *LoraLinear::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + LoraLinearParams params = get_params(); + return new LoraLinear(ff, params, {inputs[0], inputs[1]}, this->name); +} + +LoraLinearParams LoraLinear::get_params() const { + LoraLinearParams params; + params.layer_guid = this->layer_guid; + params.type = this->op_type; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + params.peft_configs = this->peft_configs; + return params; +} + +bool LoraLinearParams::is_valid( + std::pair const &input_shape) + const { + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::LoraLinearParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.trainable); + hash_combine(key, kv.second.cache_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.init_lora_weights); + } + return key; +} +}; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc new file mode 100644 index 0000000000..6e0c60e057 --- /dev/null +++ b/src/ops/lora_linear_params.cc @@ -0,0 +1,221 @@ +#include "flexflow/ops/lora_linear_params.h" +#include +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +// ---------------- Optimizer configs ---------------- +// --------------------------------------------------- + +// empty optimizer +LoraOptimizerConfig::LoraOptimizerConfig() {} + +// SGD optimizer +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() + : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} + +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig(double lr_, + double momentum_, + bool nesterov_, 
+ bool weight_decay_) + : lr(lr_), momentum(momentum_), nesterov(nesterov_), + weight_decay(weight_decay_) {} + +std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { + os << "SGD Optimizer (lr=" << llc.lr << ",momentum=" << llc.momentum + << ",nesterov=" << llc.nesterov << ",weight_decay=" << llc.weight_decay + << ")"; + return os; +} + +// Adam optimizer +LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() + : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), + epsilon(1e-8) {} + +LoraAdamOptimizerConfig::LoraAdamOptimizerConfig(double alpha_, + double beta1_, + double beta2_, + double weight_decay_, + double epsilon_) + : alpha(alpha_), beta1(beta1_), beta2(beta2_), weight_decay(weight_decay_), + epsilon(epsilon_) {} + +std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { + os << "SGD Optimizer (alpha=" << llc.alpha << ",beta1=" << llc.beta1 + << ",beta2=" << llc.beta2 << ",weight_decay=" << llc.weight_decay + << ",epsilon=" << llc.epsilon << ")"; + return os; +} + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath) { + json j = obj; + std::ofstream file(filepath); + file << j.dump(4); +} + +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { + std::ifstream file(filepath); + json j; + file >> j; + return std::make_unique(j.get()); +} + +template void + serialize_to_json_file(LoraLinearConfig const &obj, + fs::path const &filepath); +template void serialize_to_json_file( + LoraSGDOptimizerConfig const &obj, fs::path const &filepath); +template void serialize_to_json_file( + LoraAdamOptimizerConfig const &obj, fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file(fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file( + fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file( + fs::path const &filepath); + +// ------------------ LoRA configs ------------------- +// --------------------------------------------------- +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); + +LoraLinearConfig::LoraLinearConfig( + std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_, + LoraOptimizerConfig *optimizer_config_, + bool init_lora_weights_, + std::string const &base_model_name_or_path_, + std::string const &precision_, + int rank_, + float lora_alpha_, + float lora_dropout_, + std::vector const &target_modules_) + : cache_folder(cache_folder_), peft_model_id(peft_model_id_), rank(rank_), + lora_alpha(lora_alpha_), lora_dropout(lora_dropout_), + trainable(trainable_), optimizer_config(optimizer_config_), + init_lora_weights(init_lora_weights_), + base_model_name_or_path(base_model_name_or_path_), precision(precision_), + target_modules(target_modules_) { + + if (peft_model_id.empty()) { + return; + } + assert(!cache_folder.empty() && + "cache_folder must be provided when using PEFT"); + if (trainable) { + assert(optimizer_config != nullptr && + "optimizer_config must be provided when using PEFT"); + assert( + !base_model_name_or_path.empty() && + "base_model_name_or_path must be provided when training a PEFT model"); + assert(!precision.empty() && + "precision must be provided when training a PEFT model"); + } else { + assert(init_lora_weights == false && + "init_lora_weights must be false when LORA not trainable"); + assert(optimizer_config == nullptr && + "optimizer_config must be nullptr when not trainable"); 
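+    // note: in the non-trainable (inference-only) path the adapter weights
+    // are expected to come from an existing PEFT repository, so no optimizer
+    // state is attached and the HF-style config.json is loaded below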
+ } + // if we are not initializing LORA from scratch, load the configs from + // existing repository + if (!init_lora_weights) { + std::string peft_inference_config_file_path = + join_path({cache_folder, "configs", peft_model_id, "config.json"}); + std::ifstream config_file(peft_inference_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + rank = model_config["r"]; + lora_alpha = float(model_config["lora_alpha"]); + lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } + // do not load the base_model_name_or_path from the HF config because we + // may be applying LoRA to another model + } catch (json::exception const &e) { + std::cerr << "Error parsing PEFT config from JSON file: " << e.what() + << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << peft_inference_config_file_path + << std::endl; + assert(false); + } + } + assert(rank > 0 && "rank must be greater than 0"); + assert(lora_alpha > 0.0f && "lora_alpha must be greater than 0.0"); + assert(lora_dropout >= 0.0f && lora_dropout <= 1.0f && + "lora_dropout must be in [0.0, 1.0]"); + assert(target_modules.size() > 0 && "target_modules must not be left empty"); +} + +// constructor used to support unordered_map +LoraLinearConfig::LoraLinearConfig() : LoraLinearConfig("", "") {} + +bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { + if (lhs.cache_folder == rhs.cache_folder && + lhs.peft_model_id == rhs.peft_model_id && lhs.rank == rhs.rank && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && + lhs.trainable == rhs.trainable && + lhs.init_lora_weights == rhs.init_lora_weights && + lhs.optimizer_config == rhs.optimizer_config && + lhs.base_model_name_or_path == rhs.base_model_name_or_path && + lhs.precision == rhs.precision) { + for (int i = 0; i < lhs.target_modules.size(); i++) { + if (lhs.target_modules[i] != rhs.target_modules[i]) { + return false; + } + } + return true; + } + return false; +} + +std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { + os << "LoraLinearConfig: "; + os << "cache_folder: " << llc.cache_folder << ", "; + os << "peft_model_id: " << llc.peft_model_id << ", "; + os << "rank: " << llc.rank << ", "; + os << "lora_alpha: " << llc.lora_alpha << ", "; + os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "target_modules: ["; + for (int i = 0; i < llc.target_modules.size(); i++) { + os << llc.target_modules[i]; + if (i < llc.target_modules.size() - 1) { + os << ", "; + } + } + os << "], "; + os << "trainable: " << llc.trainable << ", "; + if (llc.optimizer_config != nullptr) { + os << "optimizer_config: "; + if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + os << *static_cast(llc.optimizer_config); + } else if (typeid(*llc.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + os << *static_cast(llc.optimizer_config); + } else { + os << "Unknown optimizer config type"; + } + std::cout << std::endl; + } + os << "init_lora_weights: " << llc.init_lora_weights << std::endl; + os << "base_model_name_or_path: " << llc.base_model_name_or_path << std::endl; + os << "precision: " << llc.precision << std::endl; + return os; +} + +}; // namespace FlexFlow diff --git a/src/ops/mean.cc b/src/ops/mean.cc index b2ec94fdf8..0d41276735 100644 --- a/src/ops/mean.cc +++ b/src/ops/mean.cc @@ 
-87,8 +87,7 @@ OpMeta *Mean::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handler); - return m; + return nullptr; } void Mean::forward(FFModel const &ff) {} diff --git a/src/ops/noop.cc b/src/ops/noop.cc index da2d4922e3..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,8 +90,9 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handle); + OpMeta *m = new OpMeta(handle, no_op); return m; } @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index 4621ab5909..c8b194afa9 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -315,7 +315,7 @@ OpMeta *Pool2D::init_task(Task const *task, assert(task->regions.size() == 2); Pool2D const *pool = (Pool2D *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Pool2DMeta *m = new Pool2DMeta(handle); + Pool2DMeta *m = new Pool2DMeta(handle, pool); m->profiling = pool->profiling; m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); @@ -545,7 +545,7 @@ bool Pool2D::measure_operator_cost(Simulator *sim, int output_n = sub_output.dims[3].size; int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Pool2DMeta *m = sim->pool2d_meta; + Pool2DMeta *m = new Pool2DMeta(sim->handler, this); init_kernel(m, input_w, diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 454a35caf4..1c0566e9ca 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -41,7 +41,7 @@ ReduceParams Reduce::get_params() const { } params.keepdims = keepdims; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/reduce.cpp b/src/ops/reduce.cpp index c062955ed6..fe122b13eb 100644 --- a/src/ops/reduce.cpp +++ b/src/ops/reduce.cpp @@ -25,7 +25,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reduce.cu b/src/ops/reduce.cu index 65efd90e9b..1352787a12 100644 --- a/src/ops/reduce.cu +++ b/src/ops/reduce.cu @@ -24,7 +24,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reshape.cc 
b/src/ops/reshape.cc index 49f99e2cb5..4e7fd2eb96 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -180,7 +180,7 @@ OpMeta *Reshape::init_task(Task const *task, Runtime *runtime) { Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); + ReshapeMeta *m = new ReshapeMeta(handle, reshape); std::strcpy(m->op_name, reshape->name); m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; @@ -296,7 +296,7 @@ ReshapeParams Reshape::get_params() const { ReshapeParams params; params.shape = shape_vec; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 8dd670eea3..2a30d12d6d 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -44,7 +44,8 @@ bool operator==(ResidualLayerNormParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && lhs.use_bias == rhs.use_bias && - lhs.use_two_residuals == rhs.use_two_residuals; + lhs.use_two_residuals == rhs.use_two_residuals && + lhs.inplace_residual == rhs.inplace_residual; } bool ResidualLayerNormParams::is_valid( @@ -63,7 +64,8 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -78,6 +80,7 @@ void FFModel::residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -117,7 +120,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,20 +135,20 @@ void FFModel::residual_layer_norm(const Tensor input, ? 
cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, - OP_RESIDUAL_LAYERNORM, - data_type, - name, - 2 + use_two_residuals /*inputs*/, - num_weights, - 2 /*outputs*/, - casted_input, - casted_residual1, - casted_residual2); + Layer *ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -179,6 +181,7 @@ void FFModel::residual_layer_norm(const Tensor input, ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); ln->add_int_property("use_two_residuals", use_two_residuals); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -199,6 +202,9 @@ Op *ResidualLayerNorm::create_operator_from_layer( layer->get_float_property("eps", eps); layer->get_int_property("use_two_residuals", value); bool use_two_residuals = (bool)value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; + return new ResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -209,6 +215,7 @@ Op *ResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -230,6 +237,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -243,6 +251,7 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -256,7 +265,8 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, _residual1, _use_two_residuals ? 
_residual2 : nullptr), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + use_bias(_use_bias), use_two_residuals(_use_two_residuals), + inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -326,6 +336,22 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -347,13 +373,19 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -371,13 +403,15 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -422,13 +456,17 @@ void ResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part, @@ -439,20 +477,21 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + } + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -516,7 +555,323 @@ void ResidualLayerNorm::forward(FFModel const &ff) { } void ResidualLayerNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::backward_task( + Task const *task, 
+ std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + ResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual1_grad, + residual2_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap ResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + ResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input_grad); + input_accessors.push_back(residual1_grad); + if (m->use_two_residuals) { + input_accessors.push_back(residual2_grad); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {output_grad}, + false); + } } Op *ResidualLayerNorm::materialize(FFModel &ff, @@ -554,13 +909,19 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input 
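+  // note: with inplace_residual the added output (batch_outputs[0]) aliases
+  // batch_inputs[0] (asserted above), so the input region is opened
+  // READ_WRITE and no separate requirement is added for the added output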
- launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -578,13 +939,15 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -620,14 +983,13 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); if (bc->num_tokens == 0) { return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + + 3 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); int region_idx = 0, task_region_idx = 0; @@ -655,13 +1017,23 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = - helperGetGenericTensorAccessorWO(m->output_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -699,8 +1071,14 @@ void ResidualLayerNorm::inference_task( assert(in_domain.get_volume() == residual2_domain.get_volume()); assert(residual2_domain == in_domain); } - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -734,13 +1112,13 @@ void ResidualLayerNorm::inference_task( m->effective_num_elements * m->effective_batch_size); ResidualLayerNorm::inference_kernel_wrapper( - m, input, residual1, residual2, added_output, output, gamma, beta); + m, bc, input, residual1, residual2, added_output, output, 
gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - input_accessors.push_back(input); + // input_accessors.push_back(input); input_accessors.push_back(residual1); if (m->use_two_residuals) { input_accessors.push_back(residual2); @@ -779,6 +1157,7 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -794,6 +1173,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; bool use_two_residuals; + bool inplace_residual; float eps; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -810,6 +1190,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -827,6 +1208,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + params.inplace_residual = inplace_residual; strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( @@ -853,6 +1235,7 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); hash_combine(key, params.use_two_residuals); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..582e0752ef 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -23,11 +23,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -36,6 +37,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -45,6 +47,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -75,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } 
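+  // at this point lane 0 of every warp has written its warp-level partial
+  // sum into shared[]; after the barrier, warp 0 reduces those per-warp
+  // values into the final block-wide sum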
__syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,8 +111,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -120,12 +120,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -137,7 +135,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -161,19 +159,9 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualLayerNormKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, m->effective_num_elements, @@ -188,10 +176,41 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } +} /*static*/ void 
ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -208,12 +227,13 @@ void ResidualLayerNorm::inference_kernel_wrapper( checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, input.get_float_ptr(), residual1.get_float_ptr(), - residual2.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, added_output.get_float_ptr(), output.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, @@ -224,7 +244,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m, input.get_half_ptr(), residual1.get_half_ptr(), - residual2.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, added_output.get_half_ptr(), output.get_half_ptr(), m->elementwise_affine ? gamma.get_half_ptr() : nullptr, @@ -234,6 +254,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -245,4 +335,551 @@ 
void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + 
beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = 
(num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..8cdf87a92c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -22,11 +22,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -35,6 +36,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -44,6 +46,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -74,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int 
max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,8 +110,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -119,12 +119,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -136,7 +134,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -160,33 +158,57 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualLayerNormKernel - <<>>(m->effective_num_elements, - m->eps, - input_ptr, - residual1_ptr, - residual2_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } } /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -203,6 +225,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, @@ -229,6 +252,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -240,4 +333,529 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? 
T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + 
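      // Two reduction strategies for the gamma/beta gradients: with fewer
      // than 512 rows, the simple kernel below assigns one thread per column
      // and walks all M rows serially; for larger M,
      // GammaBetaBackwardCUDAKernel tiles the reduction instead, staging
      // 32 x (32 + 1) partial sums in shared memory (the +1 padding avoids
      // bank conflicts) and finishing each column with warp shuffles after
      // an in-shared-memory transpose.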
GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b3ee7179d0..744902f908 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -42,7 +42,8 @@ using namespace FlexFlow::Kernels::ResidualRMSNorm; bool operator==(ResidualRMSNormParams const &lhs, ResidualRMSNormParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps && + lhs.dim == rhs.dim && lhs.inplace_residual == rhs.inplace_residual; } bool ResidualRMSNormParams::is_valid( @@ -55,7 +56,8 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -66,6 +68,7 @@ void FFModel::residual_rms_norm(const Tensor input1, Tensor *outputs, float eps, int dim, + bool inplace_residual, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -90,9 +93,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,12 +103,13 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); rm->add_float_property("eps", eps); rm->add_int_property("dim", dim); + rm->add_int_property("inplace_residual", inplace_residual); layers.push_back(rm); outputs[0] = rm->outputs[0]; outputs[1] = rm->outputs[1]; @@ -120,6 +124,8 @@ Op *ResidualRMSNorm::create_operator_from_layer( long long value; layer->get_int_property("dim", value); int dim = value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new ResidualRMSNorm(model, layer->layer_guid, @@ -127,6 +133,7 @@ Op *ResidualRMSNorm::create_operator_from_layer( inputs[1], eps, dim, + inplace_residual, false, layer->name); } @@ -143,6 +150,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, params.eps, params.dim, + params.inplace_residual, allocate_weights, params.name) {} @@ -157,6 +165,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, other.eps, other.dim, + other.inplace_residual, allocate_weights, other.name) {} ResidualRMSNorm::ResidualRMSNorm(FFModel &model, @@ -165,6 +174,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, const ParallelTensor _input2, float _eps, int dim, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -177,6 +187,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, _input1, _input2) { eps = _eps; + 
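  // When inplace_residual is set, output 0 (the residual sum) aliases the
  // region and partition of input 0 (see map_output_tensors below), so the
  // sum is written back in place instead of into a separate tensor. The
  // launchers account for this by promoting the first input's privilege from
  // READ_ONLY to READ_WRITE and by dropping the now-redundant region
  // requirement for outputs[0], which is why the tasks expect one fewer
  // region in that case.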
inplace_residual = _inplace_residual; inputs[0] = _input1; inputs[1] = _input2; layer_guid = _layer_guid; @@ -234,6 +245,22 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, } } +void ResidualRMSNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualRMSNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -249,36 +276,44 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -306,36 +341,45 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -383,73 +427,131 @@ FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } /* - regions[0](I): input1 + regions[0](I/O): input1 / residual output regions[1](I): input2 - regions[2](O): residual output - regions[3](O): output - regions[4](I/O): weight + regions[2](O): output + regions[3](I): weight */ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 5); - assert(regions.size() == 5); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + assert(task->regions.size() == 5 - m->inplace_residual); + assert(regions.size() == 5 - m->inplace_residual); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + + GenericTensorAccessorW residual_output, output; + GenericTensorAccessorR weight; + if (m->inplace_residual) { + // residual_output is mapped to the same region as the input + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + } else { + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[3], + task->regions[3], + 
FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); + } + + inference_kernel_wrapper( + m, bc, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output}); + if (m->inplace_residual) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input1, input2}, + {weight}, + {residual_output, output}); + } } } @@ -459,6 +561,7 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -479,6 +582,8 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + int inplace_residual; + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -487,13 +592,285 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } void ResidualRMSNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): residual output / RMS input + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(3, FID_DATA); + // regions[4](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + // regions[5](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(5, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I): 
Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void ResidualRMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 6); + assert(regions.size() == 6); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output_rms_input = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, + output_grad, + residual_output_rms_input, + residual_input0_grad, + residual_input1_grad, + weight, + weight_grad); } + +Legion::FutureMap + ResidualRMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void ResidualRMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad_1 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + peft_bwd_kernel_wrapper( + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + if (!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } + } +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { @@ -516,6 +893,7 @@ size_t hash::operator()( hash_combine(key, params.eps); hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff 
--git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 79dce65c57..8dadd7dcc3 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,7 +53,7 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -422,7 +422,7 @@ void RMSNorm::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, weight, output); + inference_kernel_wrapper(m, bc, input, weight, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -431,6 +431,166 @@ void RMSNorm::inference_task(Task const *task, } } +void RMSNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], 
regions[4], task->regions[4], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, output_grad, input, input_grad, weight, weight_grad); +} + +Legion::FutureMap + RMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // regions[2](I): weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): weight +*/ +void RMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); @@ -474,11 +634,9 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], true, this->name); + return new RMSNorm(ff, params, inputs[0], true, params.name); } -void RMSNorm::backward(FFModel const &ff) {} - bool RMSNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index b38c68843b..0358a2cd31 100644 --- a/src/ops/sampling.cc +++ 
b/src/ops/sampling.cc @@ -88,7 +88,7 @@ Op *Sampling::create_operator_from_layer( SamplingParams Sampling::get_params() const { SamplingParams params; params.top_p = this->top_p; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -302,7 +302,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); if (m->inference_debugging) { @@ -313,7 +313,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3d1c8d9094..e7c2fea19c 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,7 +52,7 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -254,7 +254,188 @@ void SigmoidSiluMulti::forward(FFModel const &ff) { } void SigmoidSiluMulti::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // output grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[1](I): input 1 + regions[2](I): input 2 + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 5); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::backward_kernel_wrapper( + m, output_grad, input1, input2, input1_grad, input2_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, + shard_id, + nullptr, + {output_grad, input1, input2}, + {}, + {input1_grad, input2_grad}); + } +} + +FutureMap + SigmoidSiluMulti::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // output grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // input 2 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, bc, output_grad, input1_grad, input2_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file(m, + shard_id, + nullptr, + {input1_grad, input2_grad}, + {}, + {output_grad}, + false); + } } FutureMap SigmoidSiluMulti::inference( @@ -347,7 +528,7 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + SigmoidSiluMulti::inference_kernel_wrapper(m, bc, input1, input2, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..ceaa1a7788 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -23,7 +23,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -34,36 +34,56 @@ SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { } } -__device__ __forceinline__ float sigmoid_float(float x) { - return 1.0 / (1.0 + expf(-x)); -} - -__device__ __forceinline__ half sigmoid_half(half x) { - return (half)1.0 / ((half)1.0 + hexp(-x)); -} - -__global__ void SigmoidSiluMultiKernelFloat(int num_elements, - float const *input1_ptr, - float const *input2_ptr, - float *output_ptr) { +template +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = - input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; } } -__global__ void SigmoidSiluMultiKernelHalf(int num_elements, - half const *input1_ptr, - half const *input2_ptr, - half *output_ptr) { +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T 
*input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); } } /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -81,8 +101,84 @@ void SigmoidSiluMulti::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + 
checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -92,7 +188,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -116,4 +212,159 @@ void SigmoidSiluMulti::inference_kernel_wrapper( } } +/*static*/ +void SigmoidSiluMulti::backward_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + hipEvent_t 
t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..929d557a17 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -22,7 +22,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -45,9 +45,44 @@ __global__ void SigmoidSiluMultiKernel(int num_elements, } } +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * 
T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); + } +} + /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -64,6 +99,83 @@ void SigmoidSiluMulti::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiKernel<<profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + 
input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423be..a02d88b98b 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -86,7 +86,7 @@ SoftmaxParams Softmax::get_params() const { SoftmaxParams params; params.layer_guid = this->layer_guid; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < 
MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -270,52 +270,12 @@ OpMeta *Softmax::init_task(Task const *task, domain = input_domain; } SoftmaxMeta *m = new SoftmaxMeta(handle, softmax, domain); - m->input_type = softmax->inputs[0]->data_type; - m->output_type = softmax->outputs[0]->data_type; // checkCUDNN(cudnnCreateTensorDescriptor(&m->outputTensor)); std::strcpy(m->op_name, softmax->name); m->layer_guid = softmax->layer_guid; return m; } -FutureMap Softmax::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv - << std::endl; */ - IndexLauncher launcher(SOFTMAX_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -354,17 +314,11 @@ void Softmax::forward_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); - } + forward_kernel_wrapper(m, input, output); } void Softmax::backward(FFModel const &ff) { @@ -402,52 +356,69 @@ void Softmax::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, input_grad, output_grad); } -/* - regions[0](I/O): input_grad - regions[1](I): output_grad -*/ -// Note that the backward task of softmax is actually a no op (i.e., input_grad -// = output_grad) since the upstream cross_entropy_loss function computes -// performs softmax_cross_entropy_loss to avoid intermediate zeros -template -void Softmax::backward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - TensorAccessorR acc_output_grad( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // make sure the image indices match! - assert(acc_input_grad.rect == acc_output_grad.rect); - - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); +FutureMap Softmax::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_ARG_TOPK || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); } void Softmax::inference_task(Task const *task, @@ -455,8 +426,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3 || regions.size() == 2); + bool is_last_op = 
(regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -465,16 +436,19 @@ void Softmax::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad; + if (is_last_op) { + output_grad = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); } + inference_kernel_wrapper(m, bc, is_last_op, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -483,6 +457,73 @@ void Softmax::inference_task(Task const *task, } } +FutureMap Softmax::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Softmax::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: @@ -508,29 +549,35 @@ bool Softmax::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorR input_acc(DT_FLOAT, sub_input.get_domain(), input_ptr); assert(input_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_acc( + DT_FLOAT, sub_output.get_domain(), output_ptr); assert(output_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW input_grad_acc( + DT_FLOAT, sub_input.get_domain(), input_grad_ptr); assert(input_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_grad_acc( + DT_FLOAT, sub_output.get_domain(), output_grad_ptr); assert(output_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, input_grad_acc, output_grad_acc); }; } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 68d3a4c205..52da51fb26 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -850,7 +850,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; 
params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a2..aebd5e8892 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -141,7 +141,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -200,15 +200,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index a00ea9c95f..4688a8233c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -361,7 +361,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; @@ -471,17 +471,18 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half 
precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -541,20 +542,9 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast
(m->devQKVProjArray) + bc->requestsInfo[i].first_token_offset_in_batch * m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast<DT *>
(m->qk_prods); - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -854,29 +844,15 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/ops/split.cc b/src/ops/split.cc index 7c6b631b20..92cfbd49e9 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -50,7 +50,7 @@ SplitParams Split::get_params() const { SplitParams params; params.splits = this->splits; params.legion_axis = this->legion_axis; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 7d30a8aff3..0e88befa68 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -87,7 +87,7 @@ TopKParams TopK::get_params() const { TopKParams params; params.k = this->k; params.sorted = this->sorted; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -226,7 +226,7 @@ OpMeta *TopK::init_task(Task const *task, Runtime *runtime) { TopK *topk = (TopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - TopKMeta *m = new TopKMeta(handle); + TopKMeta *m = new TopKMeta(handle, topk); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; @@ -474,7 +474,7 @@ bool TopK::measure_operator_cost(Simulator *sim, return false; } - TopKMeta *m = new TopKMeta(sim->handler); + TopKMeta *m = new TopKMeta(sim->handler, this); m->sorted = sorted; // allocate diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index b6e898b654..303c6e85e9 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -513,6 +513,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, // TODO: missing profiling here } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/topk.cu b/src/ops/topk.cu index cc87ee8a42..cfb2bf6448 100644 --- a/src/ops/topk.cu +++ b/src/ops/topk.cu @@ -509,6 +509,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, } } -TopKMeta::TopKMeta(FFHandler 
handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 7a179c4f7d..bffde477de 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -51,7 +51,7 @@ TransposeParams Transpose::get_params() const { for (int i = 0; i < outputs[0]->num_dims; i++) { params.perm.push_back(this->perm[i]); } - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -193,7 +193,7 @@ OpMeta *Transpose::init_task(Task const *task, Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - TransposeMeta *m = new TransposeMeta(handle); + TransposeMeta *m = new TransposeMeta(handle, transpose); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; m->inference_debugging = transpose->inference_debugging; @@ -320,7 +320,7 @@ bool Transpose::measure_operator_cost(Simulator *sim, return false; } - TransposeMeta *m = sim->transpose_meta; + TransposeMeta *m = new TransposeMeta(sim->handler, this); this->init_meta(m, sub_input.get_domain(), sub_output.get_domain()); sim->free_all(); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index df722a3d51..132a48be40 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -924,7 +924,7 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b4..890d32bc87 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -16,6 +16,8 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include #include @@ -26,11 +28,333 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +template +__global__ void compute_attention_kernel_fused_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int const max_token_per_batch, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + int num_heads, + int num_requests, + 
BatchConfig::BitMask *causalMask, + bool *request_completed, + int qk_smem_sz) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int first_token_idx = 0; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; + } + + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < qlength; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } + } + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + bool const mask = + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + + qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); + + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", + // request_idx, + // qi, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x, + // bitmask.non_tree_cache_size); + // } + qk_smem[ti - first_step] = mask ? 0.0f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + // if (head_idx == 0 && qi == 9 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + bool const mask = + prompt_phase ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + if (ti < tlength) { + bool const mask = + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. 
+ if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + + // vi); + // } + } + } +} + template __global__ void commit_tokens_kernel( DT const *devQKVProjArray, @@ -45,15 +369,15 @@ __global__ void commit_tokens_kernel( int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - size_t val_idx = - token_idx_in_last_batch * 3 * hidden_size + hidden_size + offset; + size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + + hidden_size + offset; DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -89,8 +413,9 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), + m->num_active_infr_tokens, // number of active tokens in previous batch + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), m->hidden_size); } } @@ -109,12 +434,15 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { + + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -127,6 +455,53 @@ __global__ void update_tree_branch_kv_cache( } } +template +__global__ void update_tree_branch_kv_cache_fused( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_new_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_new_tokens * 
hidden_size) { + + int token_idx = i / hidden_size; + int offset = i % hidden_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + int const req_id = tokenInfos[token_idx].request_index; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); + // } + kCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = vVal; + } +} + template __global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -157,13 +532,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -171,16 +547,20 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } + assert(processed_tokens_in_batch == + bc->requestsInfo[i].first_token_offset_in_batch); int last_token_idx_of_the_request = processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; while (processed_tokens_in_batch <= last_token_idx_of_the_request) { @@ -213,7 +593,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch 
BatchConfig::max_sequence_length(), m->hidden_size); } @@ -335,24 +715,23 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens_in_request; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens_in_request; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens_in_request; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + A = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, HIPBLAS_OP_N, HIPBLAS_OP_T, @@ -376,45 +755,44 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, HIPBLAS_GEMM_DEFAULT)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - processed_tokens_in_batch * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = processed_tokens_in_batch; + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -432,7 +810,85 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); +} + +#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_size_in_bytes_tree
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + THDS_PER_VALUE, \ + THDS_PER_BLOCK, \ + bc, \ + smem_sz); \ + compute_attention_kernel_fused_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::max_spec_tree_token_num(), \ + BatchConfig::max_tokens_per_batch(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->num_q_heads, \ + bc->num_active_requests(), \ + m->causalMask, \ + m->request_completed, \ + smem_sz[0]) + +template +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + + // update the kv cache + // update K-V cache + int num_new_tokens = bc->num_active_tokens(); + int parallelism = m->hidden_size * num_new_tokens; + update_tree_branch_kv_cache_fused<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->request_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), + m->hidden_size); + + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + // 0->qk production size, 1->total shared size + int smem_sz[2]; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } } template @@ -461,21 +917,17 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - checkCUDA( - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream)); + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; + commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -483,12 +935,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -502,11 +948,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); + // use the new kernel + compute_attention_kernel_fused
<DT>( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + + int processed_tokens_in_batch = bc->num_active_tokens(); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_o_prod_bias(m, + bc, + shard_id, + output_ptr, + weight_ptr, + bias_ptr, + processed_tokens_in_batch, + stream); } } // namespace TreeIncMultiHeadAttention @@ -622,34 +1077,21 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - } + + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + committed_token_infos = + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } checkCUDA(hipStreamSynchronize(stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 50c056c816..86c53d7ea1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" @@ -390,7 +388,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num(), m->hidden_size); @@ -509,17 +507,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -571,7 +570,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } @@ -773,6 +772,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -788,7 +788,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ @@ -896,7 +896,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << @@ -904,9 +904,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -1052,7 +1052,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -1060,21 +1060,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); committed_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 5d38e28903..52c4ec2e28 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::AllReduce; /* Params */ bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { - return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; } bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { @@ -55,7 +56,7 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; params.allreduce_legion_dim = this->allreduce_dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -110,6 +111,7 @@ OpMeta *AllReduce::init_task(Task const *task, meta->input_type[0] = ar->inputs[0]->data_type; meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); return meta; } @@ -146,6 +148,102 @@ void AllReduce::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + 
launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + void AllReduce::init_inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -224,64 +322,103 @@ FutureMap AllReduce::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } -void AllReduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = outputs[0]->parallel_is; - assert(numOutputs == 1); - assert(numInputs == 1); - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, - outputs[0]->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, 
- EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } } -void AllReduce::backward(FFModel const &ff) { +FutureMap AllReduce::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, - inputs[0]->parallel_is, - TaskArgument(NULL, 0), + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - inputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = 
helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } } bool AllReduce::measure_operator_cost(Simulator *sim, @@ -318,62 +455,6 @@ bool AllReduce::append_parallel_op_info( return true; } -/*static*/ -void AllReduce::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); -} - -/*static*/ -void AllReduce::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); -} - -void AllReduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input_grad.data_type == output_grad.data_type); - backward_kernel_wrapper(m, input_grad, output_grad); -} - }; // namespace FlexFlow namespace std { diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index acc5c414c7..ce9c032350 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Combine; /* Params */ bool operator==(CombineParams const &lhs, CombineParams const &rhs) { return lhs.combine_legion_dim == rhs.combine_legion_dim && - lhs.combine_degree == rhs.combine_degree; + lhs.combine_degree == rhs.combine_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool CombineParams::is_valid(ParallelTensorShape const &input) const { @@ -58,7 +59,7 @@ CombineParams Combine::get_params() const { CombineParams params; params.combine_legion_dim = this->combine_dim; params.combine_degree = this->combine_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ 
-102,10 +103,11 @@ OpMeta *Combine::init_task(Task const *task, Runtime *runtime) { Combine *cmb = (Combine *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - CombineMeta *m = new CombineMeta(handle); + CombineMeta *m = new CombineMeta(handle, cmb); m->input_type[0] = cmb->inputs[0]->data_type; m->output_type[0] = cmb->outputs[0]->data_type; assert(m->input_type[0] == m->output_type[0]); + std::strcpy(m->op_name, cmb->name); return m; } @@ -202,12 +204,23 @@ void Combine::create_input_partition_inference( assert(ff.config.computationMode == COMP_MODE_INFERENCE); assert(batch_outputs[0]->part != LogicalPartition::NO_PART); assert(batch_inputs[0]->part != LogicalPartition::NO_PART); - // input_lp is a disjoint partition + // partition batch_inputs[0]->region into inference_input_lps[batch_inputs[0]] + // according to the partitioning of batch_outputs[0] (i.e. make the + // partitioned dimension whole again by combining the partitions) ff.create_disjoint_partition(batch_outputs[0]->num_dims, batch_outputs[0]->dims, batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // partition batch_outputs[0]->region_grad into + // inference_output_grad_lps[batch_outputs[0]] according to the partitioning + // of batch_inputs[0] (i.e. restore the partition in the dimension that was + // combined in the forward pass) + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap Combine::inference(FFModel const &ff, @@ -226,7 +239,7 @@ FutureMap Combine::inference(FFModel const &ff, size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(COMBINE_FWD_TASK_ID, + IndexLauncher launcher(COMBINE_INF_TASK_ID, batch_outputs[0]->parallel_is, TaskArgument(nullptr, 0), argmap, @@ -234,6 +247,7 @@ FutureMap Combine::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(inference_input_lps[batch_inputs[0]], 0 /*projection id*/, @@ -278,6 +292,52 @@ void Combine::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Combine::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(&data_type, sizeof(DataType)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Combine::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -357,6 +417,37 @@ tl::optional Combine::as_dot() const { return rf; } +/*static*/ +void Combine::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + DataType data_type = m->input_type[0]; + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Combine forward"); + } +} + /*static*/ void Combine::forward_task(Task const *task, std::vector const ®ions, @@ -400,6 +491,56 @@ void Combine::forward_task_with_type(Task const *task, forward_kernel
(input_ptr, output_ptr, output_domain.get_volume()); } +void Combine::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + // TODO: figure out why m->output_type[0] or m->input_type[0] are not working + DataType data_type = *((DataType *)task->args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + assert(input_grad.data_type == data_type); + assert(output_grad.domain == input_grad.domain); + CombineMeta const *m = *((CombineMeta **)task->local_args); + int shard_id = task->index_point.point_data[0]; + if (shard_id == 0 && m->inference_debugging) { + // m is null when shard_id > 0 for some reason + std::cout << "BWD " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + backward_kernel(output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_FLOAT) { + backward_kernel(output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_DOUBLE) { + backward_kernel(output_grad.get_double_ptr(), + input_grad.get_double_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT32) { + backward_kernel(output_grad.get_int32_ptr(), + input_grad.get_int32_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT64) { + backward_kernel(output_grad.get_int64_ptr(), + input_grad.get_int64_ptr(), + output_grad.domain.get_volume()); + } else { + assert(false && "Unsupported data type in Combine backward"); + } +} + void Combine::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/parallel_ops/fused_parallel_op.cc b/src/parallel_ops/fused_parallel_op.cc index 1a76cbfc40..dec7b20fb2 100644 --- a/src/parallel_ops/fused_parallel_op.cc +++ b/src/parallel_ops/fused_parallel_op.cc @@ -59,7 +59,7 @@ FusedParallelOpParams FusedParallelOp::get_params() const { std::vector ops(std::begin(this->parallel_ops), std::end(this->parallel_ops)); params.parallel_ops = ops; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e395..7067035465 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -20,26 +20,23 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size 
= input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -49,19 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -71,10 +76,29 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a1..3041f9adf9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -13,32 +13,30 @@ * limitations under the License. 
*/ +#include "flexflow/ffconst_utils.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -48,18 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -69,10 +76,23 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(output_grad.data_type); + checkCUDA(cudaMemcpyAsync(input_grad.ptr, + output_grad.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..2a29be1ad4 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/combine_kernels.h" +#include "flexflow/parallel_ops/combine.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { 
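Taken together, the AllReduce hunks above give the operator three data paths: forward_kernel_wrapper all-reduces the full region volume, while inference_kernel_wrapper and peft_bwd_kernel_wrapper bound the transfer to the tokens that are live in the current BatchConfig (with the CUDA peft_bwd variant copying the gradient straight through rather than issuing an ncclAllReduce). The shared size arithmetic can be read as the stand-alone helper below; this is an illustrative sketch only, and the patch itself keeps the computation inline in each wrapper.

size_t active_token_elements(BatchConfig const *bc,
                             Legion::Domain const &domain) {
  // elements per token = extent of the innermost (hidden) dimension
  size_t hidden_dim_size = domain.hi()[0] - domain.lo()[0] + 1;
  // only the tokens currently scheduled in the batch are touched
  return hidden_dim_size * bc->num_active_tokens();
}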
-CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..5809e2d4f3 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/kernels/combine_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cpp b/src/parallel_ops/kernels/parallel_identity_kernels.cpp new file mode 100644 index 0000000000..8378231fb2 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cpp @@ -0,0 +1,97 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu new file mode 100644 index 0000000000..6800f3ab16 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -0,0 +1,96 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/partition_kernels.cpp b/src/parallel_ops/kernels/partition_kernels.cpp index cfd76c0f18..bd1c96d4c7 100644 --- a/src/parallel_ops/kernels/partition_kernels.cpp +++ b/src/parallel_ops/kernels/partition_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/partition_kernels.cu b/src/parallel_ops/kernels/partition_kernels.cu index 08008f1035..3a39b39fe4 100644 --- a/src/parallel_ops/kernels/partition_kernels.cu +++ b/src/parallel_ops/kernels/partition_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..1f3e8e0962 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..df7630976b 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..f49e0d4eb0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace 
Replicate { diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..0b5c434aa6 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc new file mode 100644 index 0000000000..883910ae09 --- /dev/null +++ b/src/parallel_ops/parallel_identity.cc @@ -0,0 +1,474 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/parallel_identity.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::ParallelIdentity; + +/* Params */ +bool operator==(ParallelIdentityParams const &lhs, + ParallelIdentityParams const &rhs) { + return lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; +} + +bool ParallelIdentityParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +ParallelIdentityParams ParallelIdentity::get_params() const { + ParallelIdentityParams params; + params.parallel_identity_legion_dim = this->parallel_identity_dim; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + return params; +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + const ParallelTensor _input, + int _parallel_identity_legion_dim, + char const *name) + : ParallelOp(model, OP_PARALLEL_IDENTITY, name, _input), + parallel_identity_dim(_parallel_identity_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[parallel_identity_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + ParallelIdentityParams const ¶ms, + ParallelTensor const input, + char const *name) + : ParallelIdentity( 
+ model, input, params.parallel_identity_legion_dim, params.name) {} + +void ParallelIdentity::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + +void ParallelIdentity::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // Do nothing + return; +} + +OpMeta *ParallelIdentity::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ParallelIdentity *ar = (ParallelIdentity *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ParallelIdentityMeta *meta = new ParallelIdentityMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); + return meta; +} + +void ParallelIdentity::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void ParallelIdentity::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == 
output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void ParallelIdentity::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(PARALLEL_IDENTITY_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void ParallelIdentity::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +void ParallelIdentity::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +FutureMap ParallelIdentity::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INF_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } +} + +FutureMap + ParallelIdentity::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + +bool ParallelIdentity::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool ParallelIdentity::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_PARALLEL_IDENTITY_DIM: + *value = parallel_identity_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool ParallelIdentity::append_parallel_op_info( + std::vector ¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = parallel_identity_dim; + ret.parallel_degree = -1; // ParallelIdentity does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ParallelIdentityParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.parallel_identity_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index e6ab09d088..fddf739599 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Repartition; /* Params */ bool operator==(RepartitionParams const &lhs, RepartitionParams const &rhs) { return lhs.repartition_legion_dim == rhs.repartition_legion_dim && - lhs.repartition_degree == rhs.repartition_degree; + lhs.repartition_degree == 
rhs.repartition_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool RepartitionParams::is_valid(ParallelTensorShape const &input) const { @@ -60,7 +61,7 @@ RepartitionParams Repartition::get_params() const { RepartitionParams params; params.repartition_legion_dim = this->repartition_dim; params.repartition_degree = this->repartition_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -200,6 +201,11 @@ void Repartition::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5ca2b1301c..7306e04334 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::Reduction; /* Params */ bool operator==(ReductionParams const &lhs, ReductionParams const &rhs) { return lhs.reduction_legion_dim == rhs.reduction_legion_dim && - lhs.reduction_degree == rhs.reduction_degree; + lhs.reduction_degree == rhs.reduction_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReductionParams::is_valid(ParallelTensorShape const &input) const { @@ -56,7 +57,7 @@ ReductionParams Reduction::get_params() const { ReductionParams params; params.reduction_legion_dim = this->reduction_dim; params.reduction_degree = this->reduction_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,13 @@ void Reduction::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + reduction_dim, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Reduction::init_task(Task const *task, @@ -137,6 +145,7 @@ OpMeta *Reduction::init_task(Task const *task, meta->input_type[0] = reduct->inputs[0]->data_type; meta->output_type[0] = reduct->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, reduct->name); return meta; } @@ -372,6 +381,10 @@ void Reduction::forward_task(Task const *task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { forward_kernel(input.get_half_ptr(), diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index ba7bb6677f..38215fc903 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Replicate; /* Params */ bool operator==(ReplicateParams const &lhs, ReplicateParams const &rhs) { return lhs.replicate_legion_dim == rhs.replicate_legion_dim && - lhs.replicate_degree == rhs.replicate_degree; + lhs.replicate_degree == rhs.replicate_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReplicateParams::is_valid(ParallelTensorShape 
const &input) const { @@ -55,7 +56,7 @@ ReplicateParams Replicate::get_params() const { ReplicateParams params; params.replicate_legion_dim = this->replicate_dim; params.replicate_degree = this->replicate_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,12 @@ void Replicate::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is a disjoint partition + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Replicate::init_task(Task const *task, @@ -137,6 +144,7 @@ OpMeta *Replicate::init_task(Task const *task, meta->input_type[0] = repl->inputs[0]->data_type; meta->output_type[0] = repl->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, repl->name); return meta; } @@ -276,6 +284,51 @@ void Replicate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Replicate::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Replicate::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -350,6 +403,9 @@ void Replicate::forward_task(Task const *task, assert(task->regions.size() == 2); ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -381,6 +437,37 @@ void Replicate::forward_task(Task const *task, } } +void Replicate::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support the outter most dimension + for (int i = 0; i < output_grad_domain.get_dim() - 1; i++) { + assert(output_grad_domain.lo()[i] == input_grad_domain.lo()[i]); + assert(output_grad_domain.hi()[i] == input_grad_domain.hi()[i]); + } + size_t num_elements = input_grad_domain.get_volume(); + size_t num_replicas = output_grad_domain.get_volume() / num_elements; + float const *output_grad_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *input_grad_ptr = helperGetTensorPointerRW( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "BWD " << m->op_name << std::endl; + } + + backward_kernel( + output_grad_ptr, input_grad_ptr, num_elements, num_replicas); +} + void Replicate::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 7989b0799e..4c339750c7 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,7 +25,35 @@ Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps) { + assert(max_training_steps > 0); + assert(completed_training_steps >= 0); + assert(gradient_accumulation_steps > 0); + assert(completed_training_steps < max_training_steps); + // Compute gradients should always be true + tasks.compute_gradients = true; + + // Reset gradients to zero in the first iteration and after weight updates + tasks.reset_gradients_to_zero = + (completed_training_steps == 0) || + 
(completed_training_steps % gradient_accumulation_steps == 0); + + // Update weights every gradient_accumulation_steps + tasks.update_weights = + ((completed_training_steps + 1) % gradient_accumulation_steps == 0); + + // Save updated weights only in the very last training step + tasks.save_updated_weights = + (completed_training_steps == max_training_steps - 1); + if (tasks.save_updated_weights) { + assert(tasks.update_weights); + } +} + +BatchConfig::BatchConfig() : num_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].first_token_offset_in_batch = 0; @@ -74,6 +102,14 @@ int BatchConfig::num_active_tokens() const { return num_tokens; } +int BatchConfig::num_active_infr_tokens() const { + return num_tokens; +} + +int BatchConfig::num_active_peft_tokens() const { + return num_peft_tokens; +} + /*static*/ int BatchConfig::max_requests_per_batch() { return RequestManager::get_request_manager()->get_max_requests_per_batch(); @@ -107,8 +143,13 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of active tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of inference tokens: " << bc.num_active_infr_tokens() + << std::endl; + os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of generation tokens: " << bc.num_generation_tokens + << std::endl; // Per-request info os << "Per-request info:\n"; @@ -121,9 +162,27 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " BatchConfig Req ID: " + << bc.requestsInfo[i].batch_config_request_id << std::endl; + os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase + << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " optimizer_tasks: {" + << "compute_gradients: " << std::boolalpha + << bc.requestsInfo[i].optimizer_tasks.compute_gradients + << ", reset_gradients_to_zero: " + << bc.requestsInfo[i].optimizer_tasks.reset_gradients_to_zero + << ", update_weights: " + << bc.requestsInfo[i].optimizer_tasks.update_weights + << ", save_updated_weights: " + << bc.requestsInfo[i].optimizer_tasks.save_updated_weights << "}" + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 0509c23afe..b10f8e82ab 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -137,6 +137,10 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " Number of tokens in batch: " << 
bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 57bc5a0458..386a0c940b 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -36,7 +36,8 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } @@ -271,18 +272,10 @@ __host__ void print_beam_tensor(T const *ptr, template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - float *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(float) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(float) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + float *host_ptr = (float *)calloc(num_elements, sizeof(float)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(float) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -293,26 +286,17 @@ __host__ void fprintf(tensor_file, "%.9f", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(half const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - half *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(half) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(half) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + half *host_ptr = (half *)calloc(num_elements, sizeof(half)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(half) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -323,27 +307,18 @@ __host__ void fprintf(tensor_file, "%.9f", (float)host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int32_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int32_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int32_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int32_t *host_ptr = (int32_t *)calloc(num_elements, sizeof(int32_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int32_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -354,27 +329,18 @@ __host__ void save_tensor(int32_t const *ptr, fprintf(tensor_file, "%d", host_ptr[i]); 
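The save_tensor specializations in this hunk are each rewritten to the same synchronous shape: allocate a plain host buffer, synchronize the device, copy with a blocking cudaMemcpy, dump the values as text, and free the buffer. Condensed into one sketch for readability (illustrative only; the patch keeps one explicit specialization per element type, and this sketch shows the float/half flavor, the integer variants differing only in the fprintf format string):

template <typename T>
static void save_tensor_sketch(T const *dev_ptr, size_t num_elements,
                               char const *file_name) {
  T *host_ptr = (T *)calloc(num_elements, sizeof(T));
  checkCUDA(cudaDeviceSynchronize());              // drain in-flight kernels
  checkCUDA(cudaMemcpy(host_ptr, dev_ptr, sizeof(T) * num_elements,
                       cudaMemcpyDeviceToHost));
  FILE *tensor_file = fopen(file_name, "w");
  assert(tensor_file != NULL);
  for (size_t i = 0; i < num_elements; i++) {
    fprintf(tensor_file, i == 0 ? "%.9f" : ", %.9f", (float)host_ptr[i]);
  }
  fclose(tensor_file);
  free(host_ptr);
}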
} } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int64_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int64_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int64_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int64_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int64_t *host_ptr = (int64_t *)calloc(num_elements, sizeof(int64_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int64_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -385,13 +351,12 @@ __host__ void save_tensor(int64_t const *ptr, fprintf(tensor_file, "%ld", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -404,14 +369,25 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(cudaMemcpyAsync( dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - return true; } + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(cudaMemcpyAsync( + dst, src, sizeof(T) * num_elements, cudaMemcpyHostToDevice, stream)); +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; @@ -609,6 +585,48 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { return CUDNN_DATA_FLOAT; } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + cudaPointerAttributes attributes; + cudaError_t cudaStatus = + cudaPointerGetAttributes(&attributes, maybe_devicePtr); + + if (cudaStatus == cudaSuccess) { + // Check attributes and perform actions accordingly + if (attributes.type == cudaMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.type == cudaMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.type == cudaMemoryTypeUnregistered) { + printf("Pointer is unregistered.\n"); + } else if (attributes.type == cudaMemoryTypeManaged) { + printf("Pointer is managed.\n"); + } else { + printf("Pointer is not allocated in recognized memory type.\n"); + } + } else { + fprintf(stderr, + "cudaPointerGetAttributes failed: %s\n", + cudaGetErrorString(cudaStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? 
"yes" : "no"), + (aligned16 ? "yes" : "no")); +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void @@ -620,6 +638,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -716,26 +741,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool - download_tensor(half const *ptr, half *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..5a7d98b4dc 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -188,6 +188,9 @@ std::string get_operator_type_name(OperatorType type) { return "Sampling"; case OP_ARGMAX: return 
"ArgMax"; + // PEFT Ops + case OP_LORA: + return "Lora Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; @@ -199,6 +202,8 @@ std::string get_operator_type_name(OperatorType type) { return "Reduction"; case OP_ALLREDUCE: return "AllReduce"; + case OP_PARALLEL_IDENTITY: + return "ParallelIdentity"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 819e6527e5..8213726e8a 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -30,4 +30,29 @@ bool operator==(LayerID const &lhs, LayerID const &rhs) { return lhs.id == rhs.id; } +const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); + +PEFTModelID::PEFTModelID() : id(0) {} + +PEFTModelID::PEFTModelID(size_t _id) : id(_id) { + assert(is_valid_id()); +} + +bool PEFTModelID::is_valid_id() const { + return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); +} + +bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return lhs.id == rhs.id; +} + +std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { + if (peft_model_id == PEFTModelID::NO_ID) { + os << "NO_ID"; + } else { + os << peft_model_id.id; + } + return os; +} + }; // namespace FlexFlow diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 43ce9d7005..c373e0da9b 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,12 +217,10 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -407,12 +405,10 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +686,7 @@ void 
FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -734,44 +730,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? ".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -781,7 +767,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? 
".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = @@ -809,6 +795,10 @@ void FileDataLoader::load_weights(FFModel *ff) { if (weight == NULL) { continue; } + // TODO: currently skip Lora layers + if (l->op_type == OP_LORA) { + continue; + } switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, l, i); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index b023aced6e..1a38782e81 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -36,6 +36,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -54,6 +55,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -1992,6 +1994,7 @@ std::pair, std::unordered_map> mv.device_type = MachineView::GPU; mv.ndims = 1; int total_parallel_degree = 1; + assert(op->numOutputs > 0); for (int i = 0; i < op->outputs[0]->num_dims; i++) { total_parallel_degree *= op->outputs[0]->dims[i].degree; } @@ -2434,6 +2437,13 @@ GraphOptimalViewSerialized sez.serialize(allreduce->name, strlen(allreduce->name)); break; } + case OP_PARALLEL_IDENTITY: { + ParallelIdentity *parallel_identity = (ParallelIdentity *)op; + sez.serialize(parallel_identity->parallel_identity_dim); + sez.serialize(strlen(parallel_identity->name)); + sez.serialize(parallel_identity->name, strlen(parallel_identity->name)); + break; + } case OP_FUSED_PARALLEL: { FusedParallelOp *fused = (FusedParallelOp *)op; sez.serialize(fused->num_parallel_ops); @@ -2475,6 +2485,7 @@ namespace FlexFlow { using PCG::Edge; using PCG::Graph; using PCG::GraphCostResult; +using PCG::log_graph; using PCG::Node; void FFModel::register_all_machine_views( @@ -2759,6 +2770,10 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_LORA: { + node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_MULTIHEAD_ATTENTION: { assert(num_inputs == 3); int embed_dim, num_heads, k_dim, v_dim; @@ -3042,8 +3057,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {combine_dim, combine_degree}); + CombineParams params; + params.combine_legion_dim = combine_dim; + params.combine_degree = combine_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPARTITION: { @@ -3055,8 +3073,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node( - inputs[0], {repartition_dim, repartition_degree}); + RepartitionParams params; + params.repartition_legion_dim = repartition_dim; + params.repartition_degree = repartition_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPLICATE: { @@ -3068,8 +3089,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, 
name_len); - node = get_or_create_node(inputs[0], - {replicate_dim, replicate_degree}); + ReplicateParams params; + params.replicate_legion_dim = replicate_dim; + params.replicate_degree = replicate_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REDUCTION: { @@ -3081,8 +3105,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {reduction_dim, reduction_degree}); + ReductionParams params; + params.reduction_legion_dim = reduction_dim; + params.reduction_degree = reduction_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_ALLREDUCE: { @@ -3093,24 +3120,43 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {allreduce_dim}); + AllReduceParams params; + params.allreduce_legion_dim = allreduce_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(num_inputs == 1); + int parallel_identity_dim; + dez.deserialize(parallel_identity_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + ParallelIdentityParams params; + params.parallel_identity_legion_dim = parallel_identity_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); - std::vector parallel_ops; + FusedParallelOpParams params; int num_parallel_ops; dez.deserialize(num_parallel_ops); for (int i = 0; i < num_parallel_ops; i++) { ParallelOpInfo info; dez.deserialize(info); - parallel_ops.push_back(info); + params.parallel_ops.push_back(info); } size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {parallel_ops}); + strcpy(params.name, name); + + node = get_or_create_node(inputs[0], params); break; } default: { @@ -3149,20 +3195,20 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); + log_graph.debug("Deserialized Views...\n"); for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); + log_graph.debug("node[%zu]: type(%s) view(%d %d %d) ", + it.first.guid, + it.first.to_string().c_str(), + it.second.ndims, + it.second.dim[0], + it.second.start_device_id); auto const &list = graph->inEdges.at(it.first); for (auto const &it2 : list) { Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + log_graph.debug(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); } - printf("\n"); + log_graph.debug("\n"); } } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 613df1cbcf..057be8f443 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -29,7 +29,8 @@ hipError_t get_legion_stream(hipStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } 
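The hunk above makes scale_kernel generic over the element type DT, and later hunks in this file rename download_tensor to copy_tensor_dev_to_host and add copy_tensor_host_to_dev. For reference, a minimal sketch of how a caller might combine the templated kernel with the renamed helper on the HIP backend, assuming FlexFlow's existing checkCUDA, get_legion_stream, GET_BLOCKS, and CUDA_NUM_THREADS helpers are in scope; the function name debug_rescale_and_dump is hypothetical and not part of this patch:

// Hedged sketch: rescale a device buffer in-place with the templated
// scale_kernel, then copy it back with copy_tensor_dev_to_host for inspection.
// scale_kernel computes ptr[i] = (b - a) * ptr[i] + a, so a = -1.0f, b = 1.0f
// maps values in [0, 1) onto [-1, 1).
void debug_rescale_and_dump(float *d_ptr, size_t num_elements) {
  hipStream_t stream;
  checkCUDA(get_legion_stream(&stream));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel<float>),
                     GET_BLOCKS(num_elements),
                     CUDA_NUM_THREADS,
                     0,
                     stream,
                     d_ptr,
                     num_elements,
                     -1.0f,
                     1.0f);
  // copy_tensor_dev_to_host allocates pinned host memory and issues an
  // asynchronous device-to-host copy on the same Legion stream, so synchronize
  // before reading the values.
  float *h_ptr = copy_tensor_dev_to_host<float>(d_ptr, num_elements);
  checkCUDA(hipStreamSynchronize(stream));
  printf("first element after rescale: %.4f\n", h_ptr[0]);
  checkCUDA(hipHostFree(h_ptr));
}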
@@ -55,6 +56,14 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void + copy_kernel_discrete(DT *dst, const DT *src, coord_t size, size_t *index) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[index[i]]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -224,22 +233,24 @@ __host__ void updateGAS(float *para_ptr, } template -__host__ void - print_tensor(T const *ptr, size_t num_elements, char const *prefix) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; - checkCUDA(hipHostMalloc((void **)&host_ptr, + checkCUDA(hipHostMalloc(&host_ptr, sizeof(T) * num_elements, hipHostMallocPortable | hipHostMallocMapped)); - checkCUDA(hipMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); - // checkCUDA(hipDeviceSynchronize()); + checkCUDA(hipMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); + checkCUDA(hipDeviceSynchronize()); int idx = 0; - printf("%s", prefix); + printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); - if (idx >= 16) { + printf(" %.20lf", (float)host_ptr[idx]); + if (idx >= 100) { break; } } @@ -247,6 +258,40 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template +__host__ void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + T *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(T) * channel * skip, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(T) * channel * skip, + hipMemcpyDeviceToHost, + stream)); + // checkCUDA(hipDeviceSynchronize()); + int idx = 0; + printf("%s", prefix); + + for (int i = 0; i < channel; i += 1) { + for (idx = 0; idx < num_elements; idx++) { + printf(" %.20lf", (float)host_ptr[idx + i * skip]); + if (idx >= 100) { + break; + } + } + printf("\n-----***********------\n"); + } + + checkCUDA(hipHostFree(host_ptr)); +} + template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { @@ -370,9 +415,7 @@ __host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -381,21 +424,27 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { hipHostMallocPortable | hipHostMallocMapped)); checkCUDA(hipMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); return host_ptr; } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(hipMemcpyAsync( dst, ptr, sizeof(T) * num_elements, 
hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); - return true; +} + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(hipMemcpyAsync( + dst, src, sizeof(T) * num_elements, hipMemcpyHostToDevice, stream)); } miopenStatus_t cudnnSetTensorDescriptorFromDomain( @@ -450,22 +499,23 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain( return miopenStatusBadParm; } -miopenStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Domain domain) { +miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + miopenTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; - return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + return miopenSet4dTensorDescriptor( + tensor, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -473,7 +523,7 @@ miopenStatus_t dims[1] = rect.hi[1] - rect.lo[1] + 1; dims[2] = rect.hi[2] - rect.lo[2] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[2] * dims[1], dims[0], 1, 1); } case 4: { Rect<4> rect = domain; @@ -482,7 +532,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[3] * dims[2] * dims[1], dims[0], 1, 1); } case 5: { Rect<5> rect = domain; @@ -493,7 +543,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + tensor, cudnn_data_type, dims[3], dims[2], dims[1], dims[0]); } default: assert(false && "Unsupported dim number"); @@ -553,6 +603,49 @@ void handle_unimplemented_hip_kernel(OperatorType op_type) { throw std::runtime_error("Unimplemented hip kernel for Operator: " + FlexFlow::get_operator_type_name(op_type)); } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + hipPointerAttribute_t attributes; + hipError_t hipStatus = hipPointerGetAttributes(&attributes, maybe_devicePtr); + + if (hipStatus == hipSuccess) { + // Check attributes and perform actions accordingly + if (attributes.memoryType == hipMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeArray) { + printf("Pointer points to array memory, physically located on device.\n"); + } else if (attributes.memoryType == hipMemoryTypeManaged) { + printf("Pointer points to managed memory, automaticallly managed by the " + "unified memory system.\n"); + } else if (attributes.memoryType == hipMemoryTypeUnified) { + printf("Pointer points to unified memory (not supported currently) \n"); + } else { + printf("Pointer is not allocated in 
recognized memory type.\n"); + } + } else { + fprintf(stderr, + "hipPointerGetAttributes failed: %s\n", + hipGetErrorString(hipStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? "yes" : "no"), + (aligned16 ? "yes" : "no")); +} template __global__ void assign_kernel(half *ptr, coord_t size, half value); @@ -565,6 +658,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -587,6 +687,15 @@ template __global__ void template __global__ void copy_kernel(int64_t *dst, int64_t const *src, coord_t size); +template __global__ void copy_kernel_discrete(float *dst, + float const *src, + coord_t size, + size_t *index); +template __global__ void copy_kernel_discrete(int64_t *dst, + int64_t const *src, + coord_t size, + size_t *index); + template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, size_t size, @@ -604,16 +713,42 @@ template __global__ void apply_add_with_scale(int64_t *data_ptr, size_t size, int64_t scale); -template __host__ void - print_tensor(float const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(double const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int32_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(half const *ptr, size_t rect, char const *prefix); +template __host__ void print_tensor(float const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(double const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int32_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int64_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(half const *ptr, + size_t rect, + char const *prefix, + int shard_id); + +template __host__ void print_beam_tensor(float const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int32_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int64_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template __host__ void save_tensor(float const *ptr, size_t rect, char const *file_name); @@ -626,24 +761,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ 
half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 3d299aeedd..1b65dfd869 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,31 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." 
<< std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -117,7 +138,28 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); - + // no need to map inplace tensor + // A tensor is inplace if it shares the same region as another tensor + { + bool inplace = false; + for (int j = 0; j < op->numInputs; j++) { + if (op->inputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->inputs[j]]; + inplace = true; + } + } + for (int j = 0; j < i; j++) { + if (op->outputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->outputs[j]]; + inplace = true; + } + } + if (inplace) { + continue; + } + } if (op->op_type == OP_REPLICATE) { assert(op->numInputs == 1 && op->numOutputs == 1); } @@ -175,7 +217,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } } if (!found_parallel_tensor) { - log_offload.print( + log_offload.debug( "Cannot find a previous tensor for operator(%d) output_idx(%d)", op_idx, i); @@ -191,6 +233,13 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + + pt->region_grad = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part_grad = runtime->get_logical_partition( + ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = @@ -205,6 +254,30 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // std::cout << std::endl; } + // Check whether we need to reset input grads + // We use a parallel tensor's region as the key + std::set reset_inputs; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->region != LogicalRegion::NO_REGION); + if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { + // We should not reset input grads since other operators have already + // saved gradients into the region + op->reset_input_grads[i] = false; + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || + op->op_type == OP_RESIDUAL_RMS_NORM || + op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { + op->reset_input_grads[0] = false; + } + reset_inputs.insert(op->inputs[i]->region); + } else { + reset_inputs.insert(op->inputs[i]->region); + } + } + } + // Perform fusion optimizations if (model->config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -235,34 +308,35 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { continue; } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); + log_inf_mgr.debug( + "operator[%zu]: type(%s) guid(%lu)\n", + i, + 
get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } } @@ -290,9 +364,9 @@ void InferenceManager::init_operators_inference(FFModel *model) { assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); assert(tensor_buffer[op->outputs[i]].size() > batch_index); outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; - if (i > 0) { - assert(outputs[0]->machine_view == outputs[i]->machine_view); - } + // if (i > 0) { + // assert(outputs[0]->machine_view == outputs[i]->machine_view); + // } assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } if (op->is_parallel_op()) { @@ -332,11 +406,12 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) + // num_active_requests(%d)", // bc.get_mode(), - // bc.num_active_tokens(), + // bc.num_active_infr_tokens(), // bc.num_active_requests()); - // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // assert(bc.num_active_infr_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; @@ -390,6 +465,53 @@ FutureMap InferenceManager::inference(FFModel *model, return fm; }; +void InferenceManager::peft_bwd(FFModel *model, + int index, + BatchConfigFuture const &bc) { + int batch_index = index % model->config.data_parallelism_degree; + FutureMap fm; + bool found_input_operator = false; + int last_op = model->operators.size() - 1; + // Assert that the last operator must be argmax or sampling + assert(model->operators[last_op]->op_type == OP_ARGMAX || + model->operators[last_op]->op_type == OP_ARG_TOPK || + model->operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + for (int o = last_op; o >= 0; o--) { + Op *op = model->operators[o]; + if (op->op_type == 
OP_WEIGHT) { + continue; + } + if (op->op_type == OP_INPUT) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + if (op->op_type == OP_INPUT && + tensor_buffer[op->outputs[i]].size() == 0) { + continue; + } + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + op->peft_bwd(*model, bc, inputs, outputs); + } +}; + void InferenceManager::load_input_tokens_from_batch_config( FFModel *model, BatchConfigFuture const &bc, @@ -509,17 +631,26 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." + << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." + << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -535,7 +666,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -570,6 +701,14 @@ void FFModel::compile_inference() { } } } + + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -609,6 +748,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -634,11 +775,14 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." 
+ << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) if (operators[l]->op_type == OP_ALLREDUCE || - operators[l]->op_type == OP_FUSED) { + operators[l]->op_type == OP_PARALLEL_IDENTITY || + operators[l]->op_type == OP_LORA || operators[l]->op_type == OP_FUSED) { MachineView view = operators[l]->outputs[0]->machine_view; if (view_hash_to_nccl_comms.find(view.hash()) == view_hash_to_nccl_comms.end()) { @@ -670,6 +814,8 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4c67de1aa9..f46630db3c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -47,6 +47,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -66,6 +67,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -77,6 +79,7 @@ #include #include #include +#include namespace FlexFlow { @@ -135,19 +138,21 @@ Op::Op(FFModel &model, std::string pcname; if (_name == NULL) { pcname = get_operator_type_name(op_type); + pcname = pcname + "_" + std::to_string(op_guid); } else { pcname = std::string(_name); } - pcname = pcname + "_" + std::to_string(op_guid); assert(pcname.length() < MAX_OPNAME); + // std::cout << "Creating operator: " << pcname << std::endl; std::strcpy(name, pcname.c_str()); + // std::cout << "copied name into name var: " << this->name << std::endl; for (int i = 0; i < numInputs; i++) { assert(tensors[i] != NULL); inputs[i] = tensors[i]; } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = nullptr; @@ -191,8 +196,8 @@ Op::Op(FFModel &model, } } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = NULL; @@ -1245,7 +1250,8 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ - if (op_type == OP_ALLREDUCE) { \ + if (op_type == OP_ALLREDUCE || op_type == OP_LORA || \ + op_type == OP_PARALLEL_IDENTITY) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1475,10 +1481,12 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } +#ifdef DEADCODE OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { - trainableInputs[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_INPUTS; i++) { input_type[i] = DT_NONE; @@ -1490,9 +1498,17 @@ OpMeta::OpMeta(FFHandler _handle) 
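// decoding_step appears to count forward (decoding) invocations of an operator
// for inference debugging; the bwd_step field added below presumably plays the
// same role for PEFT backward passes, so both counters start at zero when an
// OpMeta is constructed (assumption, not stated in the patch).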
output_type[i] = DT_NONE; } decoding_step = 0; + bwd_step = 0; } +#endif -OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { +OpMeta::OpMeta(FFHandler _handle, Op const *op) + : handle(_handle), profiling(op->profiling), + inference_debugging(op->inference_debugging) { + for (int i = 0; i < op->numInputs; i++) { + trainable_inputs[i] = op->trainable_inputs[i]; + reset_input_grads[i] = op->reset_input_grads[i]; + } for (int i = 0; i < op->numInputs; i++) { input_type[i] = op->inputs[i]->data_type; } @@ -1503,6 +1519,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { output_type[i] = op->outputs[i]->data_type; } decoding_step = 0; + bwd_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { @@ -1520,6 +1537,10 @@ FFRuntime::FFRuntime(FFConfig &config) { info.workSpaceSize = config.workSpaceSize; info.offload_reserve_space_size = config.cpu_offload ? config.offload_reserve_space_size : 0; + info.peft_activation_reserve_space_size = + config.enable_peft ? config.peft_activation_reserve_space_size : 0; + info.peft_weight_reserve_space_size = + config.enable_peft ? config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -1546,9 +1567,32 @@ FFRuntime *ffruntime_singleton = nullptr; int FFModel::model_counter = 0; +void make_debug_dirs() { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + if (fs::exists(debug_dir)) { + fs::remove_all(debug_dir); + } + fs::create_directories(debug_dir); + assert(fs::is_directory(debug_dir)); + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + for (auto const &subdir : debug_subdirs) { + fs::path subdir_path = debug_dir / subdir; + fs::create_directory(subdir_path); + } +} + FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), + peft_model_global_guid(PEFT_MODEL_ID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), @@ -1586,6 +1630,9 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) for (int idx = 0; idx < config.workersPerNode * config.numNodes; idx++) { handlers[idx] = ffruntime_singleton->handlers[idx]; } + if (config.inference_debugging) { + make_debug_dirs(); + } model_id = model_counter++; } @@ -2932,7 +2979,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[l]->is_parallel_op() && - operators[l]->op_type != OP_ALLREDUCE) { + operators[l]->op_type != OP_ALLREDUCE && + operators[l]->op_type != OP_PARALLEL_IDENTITY) { continue; } size_t start = 0; @@ -2978,7 +3026,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[i]->is_parallel_op() && - operators[i]->op_type != OP_ALLREDUCE) { + operators[i]->op_type != OP_ALLREDUCE && + operators[i]->op_type != OP_PARALLEL_IDENTITY) { continue; } fused_op = new FusedOp(*this, 
operators[i]); @@ -3010,8 +3059,19 @@ bool FFModel::apply_fusion( found = k; } } - assert(found >= 0); - op->inputs[idx] = fused_op->outputs[found]; + if (found >= 0) { + op->inputs[idx] = fused_op->outputs[found]; + } else { + for (int k = 0; k < fused_op->numInputs; k++) { + if (fused_op->inputs[k]->region == + op->inputs[idx]->region) { + assert(found == -1); + found = k; + } + } + assert(found >= 0); + op->inputs[idx] = fused_op->inputs[found]; + } } } // Insert op @@ -3287,6 +3347,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + // PEFT layers + case OP_LORA: { + Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } @@ -3313,9 +3379,123 @@ bool FFModel::is_mlp_block(int layer_idx) const { return false; } +bool FFModel::need_to_add_combine(int layer_idx) const { + if (config.computationMode != COMP_MODE_INFERENCE || + config.tensor_parallelism_degree == 1 || layers.size() <= 2) { + return false; + } + auto const &l = layers[layer_idx]; + // softmax followed by argmax/arg_topk: add combine before softmax + if (layer_idx == layers.size() - 2) { + auto const &l_next = layers[layer_idx + 1]; + if (l->op_type == OP_SOFTMAX && + (l_next->op_type == OP_ARG_TOPK || l_next->op_type == OP_ARGMAX)) { + return true; + } else { + return false; + } + } + // argmax/arg_topk not precedent by softmax: add combine before + // argmax/arg_topk + if (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX)) { + auto const &l_prev = layers[layer_idx - 1]; + if (l_prev->op_type == OP_SOFTMAX) { + return false; + } + return true; + } + return false; +} + +bool FFModel::need_to_add_allreduce(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { + return true; + } + return false; +} + +#ifdef DEADCODE +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 3 following + // the norm + layer_idx >= 2 && layer_idx < 
layers.size() - 3 && + // norm is followed by linear layer (lm head) + layers[layer_idx + 1]->op_type == OP_LINEAR && + // lm head is followed by softmax + layers[layer_idx + 2]->op_type == OP_SOFTMAX && + // softmax is followed by argmax/argtopk/sampling + (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || + layers[layer_idx + 3]->op_type == OP_SAMPLING || + layers[layer_idx + 3]->op_type == OP_ARGMAX || + layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { + return true; + } + return false; +} +#endif +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RMS_NORM || l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_LAYERNORM || l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 1 following + // the norm + layer_idx >= 2 && layer_idx < layers.size() - 1 && + // norm is followed by linear layer or attention + (layers[layer_idx + 1]->op_type == OP_LINEAR || + layers[layer_idx + 1]->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION))) { + return true; + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; - // for (auto const &l : layers) { + std::map + op_before_allreduce_tensors_to_parallel_tensors; + std::map transformer_layer_allreduce_count; + std::map transformer_layer_parallel_identity_count; for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; std::vector inputs; @@ -3323,14 +3503,19 @@ void FFModel::create_operators_from_layers() { // create new input tensors assert(tensors_to_parallel_tensors.find(l->inputs[i]) != tensors_to_parallel_tensors.end()); - inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + if (l->op_type == OP_LORA && + op_before_allreduce_tensors_to_parallel_tensors.find(l->inputs[i]) != + op_before_allreduce_tensors_to_parallel_tensors.end()) { + inputs.push_back( + op_before_allreduce_tensors_to_parallel_tensors[l->inputs[i]]); + } else { + inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + } } Op *op = nullptr; - // add a combine before arg_topk - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + // add a combine before last arg_max / arg_topk or before second-to-last + // softmax + if (need_to_add_combine(layer_idx)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -3353,37 +3538,97 @@ void FFModel::create_operators_from_layers() { // config.tensor_parallelism_degree); // operators.push_back(repl); // op = repl; - } else if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - 
layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_allreduce(layer_idx)) { assert(op->numOutputs == 1); - AllReduce *allreduce = - new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_allreduce_count.find(transformer_layer_id) == + transformer_layer_allreduce_count.end()) { + transformer_layer_allreduce_count[transformer_layer_id] = 0; + } + std::string allreduce_name = std::string( + "layers." + std::to_string(transformer_layer_id) + ".allreduce." + + std::to_string( + transformer_layer_allreduce_count[transformer_layer_id])); + transformer_layer_allreduce_count[transformer_layer_id]++; + AllReduce *allreduce = new AllReduce(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + allreduce_name.c_str()); operators.push_back(allreduce); + op_before_allreduce_tensors_to_parallel_tensors[l->outputs[0]] = + op->outputs[0]; op = allreduce; + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_parallel_identity(layer_idx)) { + assert(op->numOutputs == 1 || op->numOutputs == 2); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_parallel_identity_count.find( + transformer_layer_id) == + transformer_layer_parallel_identity_count.end()) { + transformer_layer_parallel_identity_count[transformer_layer_id] = 0; + } + std::string parallel_identity_name = std::string( + "layers." + std::to_string(transformer_layer_id) + + ".parallel_identity." 
+ + std::to_string( + transformer_layer_parallel_identity_count[transformer_layer_id])); + transformer_layer_parallel_identity_count[transformer_layer_id]++; + ParallelIdentity *parallel_identity = nullptr; + if (op->numOutputs == 1) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + parallel_identity_name.c_str()); + } else if (op->numOutputs == 2) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[1], + op->outputs[1]->num_dims - 1, + parallel_identity_name.c_str()); + // output 0 is taken from the residual rms norm + assert(tensors_to_parallel_tensors.find(l->outputs[0]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[0]] = op->outputs[0]; + } else { + assert(false && + "Op needing ParallelIdentity has unexpected number of outputs"); + } + operators.push_back(parallel_identity); + assert(op->numOutputs == l->numOutputs); + // last output is taken from the parallel identity + assert(tensors_to_parallel_tensors.find(l->outputs[op->numOutputs - 1]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[l->numOutputs - 1]] = + parallel_identity->outputs[0]; + op = parallel_identity; + } else { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } } - assert(op->numOutputs == l->numOutputs); - for (int i = 0; i < op->numOutputs; i++) { - tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + // if the operator has op_type==OP_LORA, and the second-to-last operator in + // the operators vector has op_type==OP_ALLREDUCE, move the operator before + // the ALLREDUCE + if (op->op_type == OP_LORA && operators.size() > 1 && + operators[operators.size() - 2]->op_type == OP_ALLREDUCE) { + Op *tmp = operators[operators.size() - 2]; + operators[operators.size() - 2] = operators[operators.size() - 1]; + operators[operators.size() - 1] = tmp; } } } @@ -3424,7 +3669,7 @@ void FFModel::compile(LossType loss_type, deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -3549,7 +3794,7 @@ void FFModel::compile(LossType loss_type, for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != nullptr); if (op->inputs[i]->owner_op->op_type == OP_INPUT) { - op->trainableInputs[i] = false; + op->trainable_inputs[i] = false; } } } @@ -3745,9 +3990,18 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions( 
+ fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; @@ -4086,6 +4340,12 @@ struct DefaultConfig { const static bool searchOverlapBackwardUpdate = false; const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB + // PEFT related fields + const static bool enablePeft = false; + const static size_t peftActivationReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB + const static size_t peftWeightReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; @@ -4122,6 +4382,11 @@ FFConfig::FFConfig() { computationMode = COMP_MODE_TRAINING; cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + // PEFT related fields + enable_peft = DefaultConfig::enablePeft; + peft_activation_reserve_space_size = + DefaultConfig::peftActivationReserveSpaceSize; + peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4248,6 +4513,18 @@ void FFConfig::parse_args(char **argv, int argc) { quantization_type = DT_INT8; continue; } + if ((!strcmp(argv[i], "-enable-peft"))) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-activation-reserve-space-size")) { + peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { + peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; @@ -5383,6 +5660,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_BWD_TASK_ID, + "residual_layernorm_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_backward_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "residual_layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, @@ -5419,6 +5728,40 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + "AddBiasResidualLayerNorm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::backward_task>( + registrar, "AddBiasResidualLayerNorm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + 
} + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "AddBiasResidualLayerNorm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::peft_bwd_task>( + registrar, "AddBiasResidualLayerNorm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // SigmoidSiluMulti task { TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, @@ -5452,6 +5795,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_BWD_TASK_ID, + "SigmoidSiluMulti Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + "SigmoidSiluMulti PEFT Bwd"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti PEFT Bwd Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); @@ -5495,7 +5870,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // rms norm task + { + TaskVariantRegistrar registrar(RMSNORM_BWD_TASK_ID, "RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RMSNORM_PEFT_BWD_TASK_ID, + "RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // residual rms norm task { TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, "Residual RMS Norm Init"); @@ -5519,7 +5923,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "RMS Norm Inference Task"); + registrar, "Residual RMS Norm Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; @@ -5528,6 +5932,51 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_BWD_TASK_ID, + "Residual RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm Backward 
Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + "Residual RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, + "layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5571,6 +6020,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, + "Linear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5699,6 +6163,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(SOFTMAX_PEFT_BWD_TASK_ID, + "Softmax PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Softmax PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // compute Loss { TaskVariantRegistrar registrar(LOSS_BWD_TASK_ID, "Loss Backward"); @@ -6303,6 +6783,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + "IncMultiHeadSelfAttention PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::peft_bwd_task>( + registrar, "IncMultiHeadSelfAttention PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( @@ -6380,6 +6878,54 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + // PEFT tasks + // LoraLinear tasks + { + TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); + 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, + "LoraLinear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_PEFT_BWD_TASK_ID, + "LoraLinear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); @@ -6411,31 +6957,47 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Forward Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + TaskVariantRegistrar registrar(FUSEDOP_PEFT_BWD_TASK_ID, + "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + + { + TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } { @@ -6529,6 +7091,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_INF_TASK_ID, "Combine Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { 
TaskVariantRegistrar registrar(COMBINE_BWD_TASK_ID, "Combine Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -6543,6 +7119,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_PEFT_BWD_TASK_ID, + "Combine PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Replicate { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); @@ -6586,6 +7177,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, + "Replicate PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Reduction { TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); @@ -6644,6 +7250,34 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, "AllReduce Inference"); @@ -6660,33 +7294,101 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + TaskVariantRegistrar registrar(ALLREDUCE_PEFT_BWD_TASK_ID, + "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } + // ParallelIdentity { - TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INIT_TASK_ID, + "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, 
"AllReduce Backward Task"); + Runtime::preregister_task_variant( + registrar, "ParallelIdentity init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_FWD_TASK_ID, + "ParallelIdentity Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_BWD_TASK_ID, + "ParallelIdentity Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INF_TASK_ID, + "ParallelIdentity Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + "ParallelIdentity PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 62f6b89b7f..9f3e2fbb10 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -165,8 +165,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index fd39ed0db0..5dab73e1a4 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,8 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/memory_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { // declare Legion names @@ -161,12 +163,51 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } + if (info->peft_activation_reserve_space_size > 0) { + // allocate memory for peft activation reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + 
.best_affinity_to(task->target_proc) + .first(); + Realm::RegionInstance workspaceInst; + handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); + handle.peft_activation_allocator->create_legion_instance( + workspaceInst, info->peft_activation_reserve_space_size); + } else { + handle.peft_activation_allocator = nullptr; + } + + if (info->peft_weight_reserve_space_size > 0) { + // allocate memory for peft weight reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_weight_allocator = + new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); + } else { + handle.peft_weight_allocator = nullptr; + } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 36ac02a3a3..dcac52397a 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,14 +2,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include - -#include -#include -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif +#include namespace FlexFlow { @@ -25,4 +18,31 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } +fs::path get_dst_folder(std::string const &subdir, + int step_idx, + int shard_idx, + bool before_kernel) { + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + assert(std::find(debug_subdirs.begin(), debug_subdirs.end(), subdir) != + debug_subdirs.end()); + std::string step_substr = "step_" + std::to_string(step_idx); + if (before_kernel) { + step_substr += "_pre"; + } + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + assert(fs::is_directory(debug_dir)); + fs::path dst_folder = + debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); + fs::create_directories(dst_folder); + return dst_folder; +} + }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b2d223f54..e9feb86eb5 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -42,6 +42,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -119,6 +120,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Combine *)op)->get_params(); case OP_ALLREDUCE: return ((AllReduce *)op)->get_params(); + case OP_PARALLEL_IDENTITY: + return ((ParallelIdentity *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp *)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bada87ab19..31a32dd3c8 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,8 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/fused.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -21,6 +23,7 @@ #include #include #include +#include #include #include @@ -28,12 +31,16 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); - assert(!fs.fail() && "no such file"); + if (fs.fail()) { + std::cerr << "Failed to open file: " << path << std::endl; + assert(false); + } std::string data; fs.seekg(0, std::ios::end); size_t size = static_cast(fs.tellg()); @@ -43,6 +50,52 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: ["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " gradient_accumulation_steps: " << req.gradient_accumulation_steps + << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " 
dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + +bool RequestManager::inference_finished = false; + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -114,6 +167,14 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) { spec_infer_tree_width.emplace_back(tree_width); } +void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { + enable_peft_finetuning = enable_peft_finetuning_; +} + +void RequestManager::set_inference_finished(bool finished) { + inference_finished = finished; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -121,33 +182,45 @@ void RequestManager::register_tokenizer(ModelType type, this->model_type = type; this->bos_token_id = bos_token_id; this->eos_token_id = eos_token_id; - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? path + '/' : path; + std::filesystem::path tokenizer_folder(path); + if (model_type == ModelType::LLAMA) { - bool path_to_file = !path.empty() && - (path.size() >= strlen("tokenizer.model")) && - path.find("tokenizer.model") == - (path.size() - strlen("tokenizer.model")); - std::string tokenizer_filepath = - path_to_file ? path : tokenizer_folder + "tokenizer.model"; - this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (std::filesystem::exists(tokenizer_model_path)) { + // load from tokenizer.model + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); + } else { + // load from tokenizer.json + std::filesystem::path tokenizer_json_path = + tokenizer_folder / "tokenizer.json"; + if (!std::filesystem::exists(tokenizer_json_path)) { + std::cerr << "Failed to open file: " << tokenizer_json_path + << std::endl; + assert(false); + } + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } } else if (model_type == ModelType::OPT) { - std::string vocab_file = tokenizer_folder + "vocab.json"; - std::string merges_file = tokenizer_folder + "merges.txt"; - std::string added_tokens_file = - tokenizer_folder + "special_tokens_map.json"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - std::filesystem::path path3(added_tokens_file); - assert(std::filesystem::exists(path1) && + std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; + std::filesystem::path merges_file = tokenizer_folder / "merges.txt"; + std::filesystem::path added_tokens_file = + tokenizer_folder / "special_tokens_map.json"; + assert(std::filesystem::exists(vocab_file) && "Vocab file vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && + assert(std::filesystem::exists(merges_file) && "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - std::string 
vocab = LoadBytesFromFile(path1.string()); - std::string merges = LoadBytesFromFile(path2.string()); - std::string added_tokens = LoadBytesFromFile(path3.string()); + std::string vocab = LoadBytesFromFile(vocab_file.string()); + std::string merges = LoadBytesFromFile(merges_file.string()); + std::string added_tokens = LoadBytesFromFile(added_tokens_file.string()); this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); @@ -182,28 +255,40 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { + RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); - // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return INVALID_GUID; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.warmup = request_.warmup; + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + request.tokens.push_back(bos_token_id); + } + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens < get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number } else { - request.initial_len = prompt.size(); - request.tokens = prompt; + std::vector tokens = this->tokenizer_->Encode(request_.prompt); + if (tokens.size() >= get_max_sequence_length()) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + return INVALID_GUID; + } + for (int i = 0; i < tokens.size(); i++) { + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + } + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); } + request.initial_len = request.tokens.size(); + if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." 
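
For reference, a minimal caller-side sketch of the new Request-based registration path shown in the hunk above. It relies only on the fields the new code reads (prompt, max_sequence_length, benchmarking_tokens) plus RequestManager::get_request_manager() and INVALID_GUID, all of which appear in this diff; the prompt text, the numeric values, and the assumption that unset Request fields keep plain-inference defaults are illustrative, not taken from the patch.

    // Hedged sketch: exercising RequestManager::register_new_request(Request const &).
    // Assumes a default-constructed Request behaves as an ordinary inference request.
    RequestManager *rm = RequestManager::get_request_manager();

    Request req;
    req.prompt = "What is the capital of France?"; // encoded internally via tokenizer_->Encode()
    req.max_sequence_length = 128;                 // decoding stops once request.tokens reaches this length
    req.benchmarking_tokens = -1;                  // >= 0 would insert dummy tokens instead of encoding the prompt

    RequestManager::RequestGuid guid = rm->register_new_request(req);
    if (guid == RequestManager::INVALID_GUID) {
      // Rejected: the encoded prompt exceeded the configured maximum sequence length.
    }
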
@@ -216,58 +301,111 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; + { + std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; + output = output + " " + std::to_string(request.tokens[i]); } + log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; + gr.input_text = request_.prompt; + gr.input_tokens = request.tokens; + gr.output_text = request_.prompt; + gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { + RequestManager::register_new_peft_request(Request const &request_) { + assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - request.tokens.push_back(bos_token_id); + request.initial_len = 0; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.req_type = RequestType::REQ_FINETUNING; + request.completed_training_steps = 0; + request.gradient_accumulation_steps = request_.gradient_accumulation_steps; + request.max_training_steps = request_.max_training_steps; + request.dataset_filepath = request_.dataset_filepath; + request.warmup = request_.warmup; + + // Load dataset + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens <= get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + std::vector input_tokens; + std::vector output_tokens; + bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + if (bos_added) { + input_tokens.push_back(bos_token_id); + } + input_tokens.insert(input_tokens.end(), + request_.benchmarking_tokens - (int)bos_added, + 15); // insert random number + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } else { + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); + std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(text); + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.insert(input_tokens.begin(), bos_token_id); + } + std::vector output_tokens = + this->tokenizer_->Encode(output_text); + if 
(input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << input_tokens.size() + output_tokens.size() << ".\n"; + return INVALID_GUID; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } + } } - std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - printf("tokens size: %zu\n", tokens.size()); - return INVALID_GUID; + if (request.gradient_accumulation_steps == -1) { + request.gradient_accumulation_steps = request.dataset.size(); } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; - } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); - request.initial_len = request.tokens.size(); + assert(request.gradient_accumulation_steps > 0 && + "Invalid gradient accumulation steps"); + assert(request.gradient_accumulation_steps <= request.max_training_steps && + "Gradient accumulation steps should be less than or equal to max " + "training steps"); + // Currently don't support speculative inference for PEFT + assert(get_num_ssms() == 0); if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." @@ -280,29 +418,38 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_peft_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); + for (size_t r = 0; r < request.dataset.size(); r++) { + std::string input = "[" + std::to_string(r) + "] input:"; + std::string output = "[" + std::to_string(r) + "] output:"; + for (size_t i = 0; i < request.dataset[r].first.size(); i++) { + input = input + " " + std::to_string(request.dataset[r].first[i]); } + for (size_t i = 0; i < request.dataset[r].second.size(); i++) { + output = output + " " + std::to_string(request.dataset[r].second[i]); + } + log_req_mgr.print("%s", input.c_str()); log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; + // gr.input_text = prompt; + // gr.input_tokens = request.tokens; + // gr.output_text = prompt; + // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -363,51 +510,117 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, + int i) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= 
old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + return request_completed; +} + +void RequestManager::check_batch(BatchConfig const &old_bc, + BatchConfig const &new_bc) { + int num_incomplete_prompts = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + continue; + } + // ensure there is no request with zero tokens + assert(new_bc.requestsInfo[i].num_tokens_in_batch > 0); + // ensure there is no more than one incomplete prompt + if (new_bc.requestsInfo[i].prompt_phase && + new_bc.requestsInfo[i].num_tokens_in_batch + + new_bc.requestsInfo[i].first_token_depth_in_request < + all_requests[new_bc.requestsInfo[i].request_guid].tokens.size()) { + num_incomplete_prompts++; + } + } + if (num_incomplete_prompts > 1) { + std::cout << "Error: more than one incomplete prompt in the batch\n"; + pid_t pid = getpid(); + std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + std::ofstream filen(filenamen); + if (filen.is_open()) { + filen << new_bc << std::endl; + filen.close(); + std::cout << "String written to file: " << filenamen << std::endl; + } else { + std::cout << "Unable to open file: " << filenamen << std::endl; + } + std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + std::ofstream fileo(filenameo); + if (fileo.is_open()) { + fileo << old_bc << std::endl; + fileo.close(); + std::cout << "String written to file: " << filenameo << std::endl; + } else { + std::cout << "Unable to open file: " << filenameo << std::endl; + } + assert(false); + } +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_active_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; + if (request.req_type == RequestType::REQ_FINETUNING) { + continue; + } if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; } else { + // This is a decoding token assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); - // This is a decoding token + if (!profiling_requests[guid].first_token_time_set) { + profiling_requests[guid].first_token_time = + Realm::Clock::current_time_in_microseconds(); + profiling_requests[guid].first_token_time_set = true; + } log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); // std::string output = this->tokenizer_->Decode(request.tokens); // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; int num_active_req = -1; - // Step 2: prepare the next batch for existing requests + // when finetuning is enabled, the last entry in the batch cannot be used for + // inference + int inference_batch_size = + BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { // add new requests to the next batch + for (int i = 0; i < inference_batch_size; i++) { + if 
(old_bc.request_completed[i]) { + // no need to carry over tokens to new batch for this request continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } + bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -435,32 +648,40 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print("[%s] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", + request.warmup ? "Warmup" : "Profile", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - + profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Profile") + << "] guid(" << request.guid << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") ttft(" << std::fixed << std::setprecision(3) + << (profile_info.first_token_time - + profile_info.registration_time) + << ")\n"; + if (request.benchmarking_tokens <= 0) { + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; } - outputFile << std::endl; - outputFile << output; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -468,13 +689,15 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(false); } } - } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; num_active_req++; @@ -487,8 +710,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. 
+ for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; + } + } new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; @@ -509,13 +749,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + // Step 3: add new inference requests to the next batch if there is space + for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == RequestType::REQ_INFERENCE); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -526,15 +767,16 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + // add start time to profile_info for the new request + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -551,6 +793,170 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + if (enable_peft_finetuning && + !old_bc.request_completed[inference_batch_size]) { + assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0); + Request &request = + all_requests[old_bc.requestsInfo[inference_batch_size].request_guid]; + assert(request.req_type == RequestType::REQ_FINETUNING && + "Found misplaced inference request"); + + request.finetuning_losses.push_back(result.finetuning_loss); + + request.dataset_entry_processed_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.processed_finetuning_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.finetuning_tokens_per_batch.push_back( + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch); + int 
dataset_entry = + request.completed_training_steps % request.dataset.size(); + if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch == + request.dataset[dataset_entry].first.size()) { + // completed the current dataset entry + assert(request.dataset_entry_processed_tokens == + request.dataset[dataset_entry].first.size()); + request.completed_training_steps += 1; + request.dataset_entry_processed_tokens = 0; + } + + assert(request.completed_training_steps <= request.max_training_steps); + if (request.completed_training_steps == request.max_training_steps || + inference_finished) { + // check if the fine tuning request has completed + request.status = Request::COMPLETED; + + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.finetuning_losses = request.finetuning_losses; + trigger_request_completion_future(request.guid); + num_processed_requests++; + + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[%s] guid(%zu) completed_training_steps(%d) " + "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.warmup ? "Warmup" : "Finetuning", + request.guid, + request.completed_training_steps, + request.processed_finetuning_tokens, + profile_info.finish_time - profile_info.start_time); + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + std::string tokens_str = "["; + for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); + i++) { + tokens_str += + std::to_string(request.finetuning_tokens_per_batch[i]); + if (i != request.finetuning_tokens_per_batch.size() - 1) { + tokens_str += ", "; + } + } + tokens_str += "]"; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Finetuning") + << "] guid(" << request.guid + << ") completed_training_steps(" + << request.completed_training_steps + << ") processed_finetuning_tokens(" + << request.processed_finetuning_tokens << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") tokens_per_batch(" << tokens_str << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + } + } + + // Step 4: add PEFT bwd requests, if there is additional space + while (pending_peft_request_queue.size() > 0) { + Request &request = pending_peft_request_queue.front(); + // assert(request.req_type = RequestType::REQ_FINETUNING); + Request &all_req_handle = all_requests[request.guid]; + // assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + if (all_req_handle.status == Request::COMPLETED) { + pending_peft_request_queue.pop(); + } else { + break; + } + } + + if (pending_peft_request_queue.size() > 0 && !inference_finished) { + Request &request = pending_peft_request_queue.front(); + assert(request.req_type = RequestType::REQ_FINETUNING); + assert(request.dataset.size() > 0); + // update status and training steps + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + + request.completed_training_steps = all_req_handle.completed_training_steps; + request.processed_finetuning_tokens = + all_req_handle.processed_finetuning_tokens; + request.status = all_req_handle.status; + int dataset_entry = + request.completed_training_steps % request.dataset.size(); + request.dataset_entry_processed_tokens = + all_req_handle.dataset_entry_processed_tokens; + request.gradient_accumulation_steps = + all_req_handle.gradient_accumulation_steps; + + assert(request.status != Request::COMPLETED); + assert(request.max_training_steps > 0 && + request.completed_training_steps < request.max_training_steps); + assert(request.dataset_entry_processed_tokens <= + request.dataset[dataset_entry].first.size()); + + int num_peft_tokens = + min((int)request.dataset[dataset_entry].first.size() - + request.dataset_entry_processed_tokens, + get_max_tokens_per_batch() - new_bc.num_active_infr_tokens()); + int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); + assert(num_peft_label_tokens == 0); + + if (num_peft_tokens > 0) { + assert(new_bc.request_completed[inference_batch_size]); + // request info + new_bc.request_completed[inference_batch_size] = false; + new_bc.requestsInfo[inference_batch_size].first_token_depth_in_request = + request.dataset_entry_processed_tokens; + new_bc.requestsInfo[inference_batch_size].first_token_offset_in_batch = + new_bc.num_active_infr_tokens(); + new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = + num_peft_tokens; + new_bc.requestsInfo[inference_batch_size].max_sequence_length = + request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + set_optimizer_tasks( + new_bc.requestsInfo[inference_batch_size].optimizer_tasks, + request.max_training_steps, + request.completed_training_steps, + request.gradient_accumulation_steps); + // tokens info + for (size_t i = request.dataset_entry_processed_tokens; + i < request.dataset_entry_processed_tokens + num_peft_tokens; + i++) { + 
new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[dataset_entry].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = + inference_batch_size; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; + new_bc.num_tokens++; + new_bc.num_peft_tokens++; + } + } + } return new_bc; } @@ -722,11 +1128,17 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -736,7 +1148,6 @@ BeamSearchBatchConfig } outputFile << std::endl; outputFile << output; - outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -884,10 +1295,10 @@ BeamSearchBatchConfig // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -901,13 +1312,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + profiling_requests[new_request.guid].llm_decoding_steps = 0; + profiling_requests[new_request.guid].ssm_decoding_steps = 0; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; + int ssm_decoding_steps = + profiling_requests[new_request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = spec_infer_tree_width.size() > ssm_decoding_steps @@ -1552,7 +1963,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min(max_prompt_load_size, (int)request.initial_len - @@ -2105,7 +2515,7 @@ std::vector> // must in this branch. 
int layer_slot = i - processed_whole_layer_tokens; int layer_slot_total = treeLayers[layer_num]; - if ((first_layer_slot == layer_slot)) { + if (first_layer_slot == layer_slot) { verifiedTree.push_back(output); new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); @@ -2297,19 +2707,34 @@ std::vector> } std::vector - FFModel::generate(std::vector &prompts, int max_seq_length) { + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); - std::vector guids; - for (int i = 0; i < prompts.size(); i++) { - RequestManager::RequestGuid guid = - rm->register_new_request(prompts.at(i), max_seq_length); - if (guid != RequestManager::INVALID_GUID) { - guids.push_back(guid); + // reset inference_finished flag + rm->set_inference_finished(false); + std::vector inf_guids, peft_guids; + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + inf_guids.push_back(guid); + } + } else { + guid = rm->register_new_peft_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + peft_guids.push_back(guid); + } } } std::vector results; - for (int i = 0; i < guids.size(); i++) { - results.push_back(rm->get_generation_result(guids[i])); + for (int i = 0; i < inf_guids.size(); i++) { + results.push_back(rm->get_generation_result(inf_guids[i])); + } + if (inf_guids.size() > 0) { + rm->set_inference_finished(); + } + for (int i = 0; i < peft_guids.size(); i++) { + results.push_back(rm->get_generation_result(peft_guids[i])); } return results; } @@ -2342,6 +2767,18 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2358,6 +2795,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2365,13 +2807,48 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + #ifdef FF_USE_NCCL llm->finish_nccl_comms(); #endif + + // Print at the end of the task + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); +} + +std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { + for (size_t i = 0; i < model->layers.size(); i++) { + if (model->layers[i]->layer_guid == guid) { + std::string layer_name(model->layers[i]->name); + return layer_name; + } + } + assert(false); + return "invalid_layer_name"; +} + +bool is_peft_operator_type(OperatorType type) { + switch (type) { + case OP_LORA: + return true; + default: + return false; + } } /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists 
+ if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm @@ -2419,6 +2896,9 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); + if (llm->config.enable_peft) { + im->peft_bwd(llm, 0, bcf); + } assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6d..8e5f302466 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -73,74 +73,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(beam_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(tree_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += 
sizeof(BatchConfig::causalMask); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->committed_tokens, &(tree_batch_config->committed_tokens), sizeof(TreeVerifyBatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - } - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be73..343f1dd6e6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -40,8 +40,21 @@ void RequestManager::load_tokens_task( printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + + // pid_t pid = getpid(); + // std::string filename = "bc_" + std::to_string(pid) + ".txt"; + // std::ofstream file(filename); + // if (file.is_open()) { + // file << *batch_config << std::endl; + // file.close(); + // std::cout << "String written to file: " << filename << std::endl; + // } else { + // std::cout << "Unable to open file: " << filename << std::endl; + // } + } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { + BatchConfig::max_verify_tokens_per_batch() && + batch_config->get_mode() != INC_DECODING_MODE) { printf("Warning: Speculative decoding. too many tokens in prompt, only " "load up to %d tokens\n", BatchConfig::max_verify_tokens_per_batch()); @@ -80,91 +93,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, 
- stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->committed_tokens, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); } - - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 0daf151d2c..56931e0dc7 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -82,17 +82,17 @@ Simulator::Simulator(FFModel const *model, checkCUDA(hipEventCreate(&start_event)); checkCUDA(hipEventCreate(&end_event)); - conv2d_meta = new Conv2DMeta(handler); - // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); - // ele_binary_meta = new ElementBinaryMeta(handler); - // embedding_meta = new EmbeddingMeta(handler); - // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); - // 
dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); + // linear_meta = new LinearMeta(handler, 4096); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); + // embedding_meta = new EmbeddingMeta(handler); + // softmax_meta = new SoftmaxMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); + // dropout_meta = new DropoutMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index b44ce1690a..056781f73d 100644 --- a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -81,17 +81,17 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); - conv2d_meta = new Conv2DMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; @@ -103,13 +103,13 @@ Simulator::~Simulator(void) { simulatorInst.destroy(); cudaEventDestroy(start_event); cudaEventDestroy(end_event); - delete conv2d_meta; - delete pool2d_meta; - delete ele_unary_meta; - delete batch_matmul_meta; - delete concat_meta; - delete transpose_meta; - delete task_manager; + // delete conv2d_meta; + // delete pool2d_meta; + // delete ele_unary_meta; + // delete batch_matmul_meta; + // delete concat_meta; + // delete transpose_meta; + // delete task_manager; } __host__ void diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index b86964049d..9b6510fe5e 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -43,6 +43,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -3754,14 +3755,17 @@ bool FFModel::convert_graph_to_operators( assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; new_op = new Softmax( - *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); + *this, softmax->layer_guid, inputs[0], softmax->dim, softmax->name); break; } case OP_COMBINE: { assert(inList.size() == 1); Combine *combine = (Combine *)node.ptr; - new_op = new Combine( - *this, inputs[0], combine->combine_dim, combine->combine_degree); + new_op = new Combine(*this, + inputs[0], + 
combine->combine_dim, + combine->combine_degree, + combine->name); break; } case OP_REPARTITION: { @@ -3770,7 +3774,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Repartition(*this, inputs[0], repart->repartition_dim, - repart->repartition_degree); + repart->repartition_degree, + repart->name); break; } case OP_REPLICATE: { @@ -3779,7 +3784,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Replicate(*this, inputs[0], replicate->replicate_dim, - replicate->replicate_degree); + replicate->replicate_degree, + replicate->name); break; } case OP_REDUCTION: { @@ -3788,13 +3794,24 @@ bool FFModel::convert_graph_to_operators( new_op = new Reduction(*this, inputs[0], reduction->reduction_dim, - reduction->reduction_degree); + reduction->reduction_degree, + reduction->name); break; } case OP_ALLREDUCE: { assert(inList.size() == 1); AllReduce *allreduce = (AllReduce *)node.ptr; - new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + new_op = new AllReduce( + *this, inputs[0], allreduce->allreduce_dim, allreduce->name); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(inList.size() == 1); + ParallelIdentity *parallel_identity = (ParallelIdentity *)node.ptr; + new_op = new ParallelIdentity(*this, + inputs[0], + parallel_identity->parallel_identity_dim, + parallel_identity->name); break; } case OP_FUSED_PARALLEL: { @@ -3819,8 +3836,9 @@ bool FFModel::convert_graph_to_operators( abr_ln->elementwise_affine, abr_ln->use_bias, abr_ln->eps, + abr_ln->inplace_residual, true, - NULL); + abr_ln->name); break; } case OP_SIGMOID_SILU_MULTI: { @@ -3828,7 +3846,7 @@ bool FFModel::convert_graph_to_operators( SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; SigmoidSiluMultiParams params = ssm->get_params(); new_op = new SigmoidSiluMulti( - *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + *this, ssm->layer_guid, inputs[0], inputs[1], ssm->name); break; } default: { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 49d42bb6dd..a71b1070b2 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -54,6 +54,10 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index f3732d54f4..0000000000 --- a/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ -inference/python_test_configs/*.json diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 3fb361f25c..73b0cb30dc 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -11,7 +11,7 @@ function generate_torch_tensor(){ python tests/align/align_create_tensor_torch.py -o "$1" } -ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather) +ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape 
scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) #create flexflow tensors conda activate flexflow diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 1e8dd4298f..c7206eac93 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -23,8 +23,8 @@ remove_mnist() { download_mnist() { if [[ ! -f train-images-idx3-ubyte || ! -f train-labels-idx1-ubyte ]]; then remove_mnist - wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz - wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-images-idx3-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-labels-idx1-ubyte.gz gzip -d train-images-idx3-ubyte.gz gzip -d train-labels-idx1-ubyte.gz fi diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 8beea55999..a9dd8809ba 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 
-ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -37,63 +37,63 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 
+../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + 
../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 6857b5cbc1..5e563c9974 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -77,20 +77,18 @@ def main(): # Set default tensor type depending on argument indicating the float type to use if not args.use_full_precision: - torch.set_default_tensor_type(torch.HalfTensor) - + torch.set_default_dtype(torch.float16) + else: + torch.set_default_dtype(torch.float32) + # Run huggingface model cuda_availble = torch.cuda.is_available() device = "cuda" if args.gpu and cuda_availble else "cpu" # Get Model - model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) + model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=True).to(device) # Get Tokenizer hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) - hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) - else: - tokenizer = AutoTokenizer.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample ################# debugging ################# diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index a1ee281914..a83464754f 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -84,12 +84,13 @@ function compare_decoding_steps_spec_infer_incr_decoding { local specInf_file="$2" # Read the number of decoding steps from the second line of the files - second_line=$(sed -n '2p' "$incrDec_file") - read -r line <<< "$second_line" - incrDec=${line#*: } - second_line=$(sed -n '2p' "$specInf_file") - read -r line <<< 
"$second_line" - specInf=${line#*: } + first_line=$(sed -n '1p' "$incrDec_file") + incr_dec_steps="${first_line##*llm_decoding_steps(}" + incr_dec_steps="${incr_dec_steps%%)*}" + + first_line=$(sed -n '1p' "$specInf_file") + spec_inf_steps="${first_line##*llm_decoding_steps(}" + spec_inf_steps="${spec_inf_steps%%)*}" if ! command -v bc &> /dev/null; then echo "bc is not installed. Installing..." @@ -97,8 +98,8 @@ function compare_decoding_steps_spec_infer_incr_decoding { fi # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + threshold=$(bc <<< "$spec_inf_steps * 1.5") + if (( $(echo "$incr_dec_steps >= $threshold" | bc -l) )); then #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." : else @@ -184,13 +185,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail 
-n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 41703cf431..0a745c7984 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -14,9 +14,12 @@ "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8 GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py new file mode 100644 index 0000000000..93727bdc89 --- /dev/null +++ b/tests/peft/alignment/align_test_utils.py @@ -0,0 +1,510 @@ +import os, re, torch +import numpy as np +from typing import List +from enum import Enum +from dataclasses import dataclass + +abs_dirname = os.path.dirname(os.path.abspath(__file__)) +cache_folder = os.path.expanduser(os.getenv("FF_CACHE_PATH", "~/.cache/flexflow")) +hf_path = os.path.join(cache_folder, "debug/huggingface") +ff_path = os.path.join(cache_folder, "debug/flexflow") + + +def print_unique_files_list(dirname): + files_list = os.listdir(dirname) + for f in sorted(files_list): + match = re.search(r"layers.\d+", f) + if match: + if "layers." in match[0]: + layer_num = int(match[0].split(".")[1]) + if layer_num > 0: + files_list.remove(f) + elif "layers_" in match[0]: + layer_num = int(match[0].split("_")[1]) + if layer_num > 0 and layer_num != 100: + files_list.remove(f) + return sorted(files_list) + + +def compare_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor are equal + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the checks. Defaults to 1e-2. 
+ + Raises: + FileNotFoundError: _description_ + FileNotFoundError: _description_ + """ + if not os.path.exists(hf_tensor_filepath): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found") + if not os.path.exists(ff_tensor_filepath): + raise FileNotFoundError(f"FF tensor file {ff_tensor_filepath} not found") + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_tensors_difference( + hf_tensor_filepath: str, + ff_tensor1_filepath: str, + ff_tensor2_filepath: str, + tolerance: float = 1e-2, +): + """Check whether a HuggingFace tensor is equal to the difference between two FlexFlow tensors + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor1_filepath (str): The file path of the first FlexFlow tensor + ff_tensor2_filepath (str): The file path of the second FlexFlow tensor + tolerance (float, optional): The floating-point error tolerance for the equality check. Defaults to 1e-2. 
+ """ + assert os.path.exists(hf_tensor_filepath) + assert os.path.exists(ff_tensor1_filepath) + assert os.path.exists(ff_tensor2_filepath) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor1 = ff_tensor1[:len_hf_tensor] + ff_tensor2 = ff_tensor2[:len_hf_tensor] + ff_tensor = ff_tensor1 - ff_tensor2 + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print( + f"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}" + ) + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_hf_tensors(tensor1_fp: str, tensor2_fp: str): + """Checks whether two HuggingFace tensors are equal + + Args: + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file: {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert len(hf_tensor2) == 1 + hf_tensor2 = hf_tensor2[0] + assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + if not ( + np.allclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ): + print(f"mismatch between {tensor1_fp} and {tensor2_fp}") + print(hf_tensor1) + print(hf_tensor2) + print( + np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ) + mismatches = np.where( + ~np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + )[0] + print(mismatches) + assert False + print("Ok!") + + +def check_hf_sum_tensors(tensor_sum_fp: str, tensor1_fp: str, tensor2_fp: str): + """Checks whether a HuggingFace tensor is equal to the sum of two other HuggingFace tensors + + Args: + tensor_sum_fp (str): The file path of the sum tensor + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor_sum_fp): + raise FileNotFoundError(f"HF tensor file: {tensor_sum_fp} not found") + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor_sum = torch.load(tensor_sum_fp) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = 
torch.load(tensor2_fp)
+    if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:
+        assert len(hf_tensor_sum) == 1
+        hf_tensor_sum = hf_tensor_sum[0]
+    if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
+        assert len(hf_tensor1) == 1
+        hf_tensor1 = hf_tensor1[0]
+    if type(hf_tensor2) == tuple or type(hf_tensor2) == list:
+        assert len(hf_tensor2) == 1
+        hf_tensor2 = hf_tensor2[0]
+    assert torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape
+    assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape
+    hf_tensor1 = torch.nan_to_num(hf_tensor1)
+    hf_tensor2 = torch.nan_to_num(hf_tensor2)
+    hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)
+    sum_check_tensor = hf_tensor1 + hf_tensor2
+    if not (
+        np.allclose(
+            sum_check_tensor.detach().cpu().numpy(),
+            hf_tensor_sum.detach().cpu().numpy(),
+        )
+    ):
+        print(f"mismatch between {tensor_sum_fp} and {tensor1_fp} + {tensor2_fp}")
+        print(tensor_sum_fp)
+        print(sum_check_tensor)
+        print(hf_tensor1)
+        print(hf_tensor2)
+        print(
+            np.isclose(
+                sum_check_tensor.detach().cpu().numpy(),
+                hf_tensor_sum.detach().cpu().numpy(),
+            )
+        )
+        mismatches = np.where(
+            ~np.isclose(
+                sum_check_tensor.detach().cpu().numpy(),
+                hf_tensor_sum.detach().cpu().numpy(),
+            )
+        )[0]
+        print(mismatches)
+        assert False
+    print("Ok!")
+
+
+def check_hf_zero_tensor(hf_tensor_fp: str):
+    """Check whether a HuggingFace tensor is a zero tensor
+
+    Args:
+        hf_tensor_fp (str): The file path of the HuggingFace tensor
+    """
+    if not os.path.exists(hf_tensor_fp):
+        raise FileNotFoundError(f"HF tensor file: {hf_tensor_fp} not found")
+    hf_tensor1 = torch.load(hf_tensor_fp)
+    if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
+        assert len(hf_tensor1) == 1
+        hf_tensor1 = hf_tensor1[0]
+    assert torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0
+
+
+def print_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, txt: str = ""):
+    """Print the contents of a HuggingFace tensor and a FlexFlow tensor
+
+    Args:
+        hf_tensor_filepath (str): The file path of the HuggingFace tensor
+        ff_tensor_filepath (str): The file path of the FlexFlow tensor
+        txt (str, optional): Additional text to prepend to the tensors. Defaults to "".
+    """
+    assert os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)
+    hf_tensor = torch.load(hf_tensor_filepath)
+    if type(hf_tensor) == tuple or type(hf_tensor) == list:
+        assert len(hf_tensor) == 1
+        hf_tensor = hf_tensor[0]
+    hf_tensor = torch.nan_to_num(hf_tensor)
+    hf_tensor = hf_tensor.flatten().detach().cpu().numpy()
+    ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",")
+
+    len_hf_tensor = hf_tensor.shape[0]
+    ff_tensor = ff_tensor[:len_hf_tensor]
+
+    print(f"{txt} - HF tensor:")
+    print(hf_tensor)
+    print(f"{txt} - FF tensor: ")
+    print(ff_tensor)
+
+
+def compare_flexflow_tensors(
+    ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5, max_len: int = -1
+):
+    """Check whether two FlexFlow tensors are equal
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+        max_len (int, optional): Maximum number of elements to check (if > 0). Defaults to -1.
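+
+    Example:
+        Illustrative sketch only; the file names are hypothetical placeholders
+        for two dumps of the same logical tensor, and max_len restricts the
+        comparison to the first 1024 elements.
+
+        compare_flexflow_tensors(
+            ff_tensor1_fp="run1/fwd_step_0_layers_0_norm_shard_0_output_0",
+            ff_tensor2_fp="run2/fwd_step_0_layers_0_norm_shard_0_output_0",
+            tolerance=1e-5,
+            max_len=1024,
+        )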
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file is not found
+        FileNotFoundError: If the second FlexFlow tensor file is not found
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
+    assert os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)
+    ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",")
+    ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",")
+
+    if ff_tensor1.shape != ff_tensor2.shape:
+        print(ff_tensor1.shape, ff_tensor2.shape)
+    assert ff_tensor1.shape == ff_tensor2.shape
+
+    if max_len > -1:
+        ff_tensor1 = ff_tensor1[:max_len]
+        ff_tensor2 = ff_tensor2[:max_len]
+
+    mismatches = []
+    if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):
+        print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}")
+        print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}")
+        print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))
+        mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]
+        print(mismatches)
+    # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
+    assert len(mismatches) <= 0.05 * len(ff_tensor1)
+    print("Ok!")
+
+
+def compare_flexflow_tensors_shortest(
+    ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5
+):
+    """Compare two FlexFlow tensors up to the length of the shorter tensor
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file is not found
+        FileNotFoundError: If the second FlexFlow tensor file is not found
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
+    ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",")
+    ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",")
+    minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])
+    ff_tensor1 = ff_tensor1[:minlen]
+    ff_tensor2 = ff_tensor2[:minlen]
+    mismatches = []
+    if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):
+        print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}")
+        print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}")
+        print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))
+        mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]
+        print(mismatches)
+    # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
+    assert len(mismatches) <= 0.05 * len(ff_tensor1)
+    print("Ok!")
+
+
+def check_flexflow_tensors_sum(
+    ff_tensor_sum_fp: str, ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance=1e-5
+):
+    """Check whether a FlexFlow tensor is equal to the sum of two other FlexFlow tensors
+
+    Args:
+        ff_tensor_sum_fp (str): The file path of the FlexFlow sum tensor
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
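+
+    Example:
+        Illustrative sketch only; the file names below are hypothetical
+        placeholders. The typical use is verifying that a fused output dump
+        equals the element-wise sum of two partial output dumps.
+
+        check_flexflow_tensors_sum(
+            ff_tensor_sum_fp="dumps/fused_output",
+            ff_tensor1_fp="dumps/base_output",
+            ff_tensor2_fp="dumps/lora_output",
+            tolerance=1e-5,
+        )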
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file is not found
+        FileNotFoundError: If the second FlexFlow tensor file is not found
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
+    ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",")
+    ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",")
+    ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=",")
+
+    ff_sum = ff_tensor1 + ff_tensor2
+    assert ff_tensor1.shape == ff_tensor2.shape
+
+    mismatches = []
+    if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):
+        print(
+            f"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}"
+        )
+        print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}")
+        print(f"Sum Tensor: {ff_tensor_sum}\nActual sum:{ff_sum}")
+        print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))
+        mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]
+        print(mismatches)
+    # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
+    assert len(mismatches) <= 0.05 * len(ff_tensor1)
+    print("Ok!")
+
+
+def load_ff_tensor(filename: str, shape: List[int]):
+    """Load a FlexFlow tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the FF tensor
+        shape (List[int]): The shape of the FF tensor
+
+    Returns:
+        np.ndarray: The FF tensor as a numpy array
+    """
+    if ff_path not in filename:
+        filename = os.path.join(ff_path, filename)
+    ff_tensor = np.loadtxt(filename, delimiter=",").reshape(shape, order="F")
+    return ff_tensor
+
+
+def load_hf_tensor(filename: str):
+    """Load a HuggingFace tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the HF tensor
+
+    Returns:
+        np.ndarray: The HF tensor as a numpy array
+    """
+    if hf_path not in filename:
+        filename = os.path.join(hf_path, filename)
+    hf_tensor = torch.load(filename)
+    hf_tensor = hf_tensor.detach().cpu().numpy()
+    return hf_tensor
+
+
+def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2):
+    """Check whether a HuggingFace tensor and a FlexFlow tensor, both already loaded into memory as numpy arrays, are equal
+
+    Args:
+        hf_tensor (np.ndarray): The HuggingFace tensor (in numpy array form)
+        ff_tensor (np.ndarray): The FlexFlow tensor (in numpy array form)
+        tolerance (float, optional): The floating-point error tolerance for the check. Defaults to 1e-2.
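+
+    Example:
+        Illustrative sketch only; the file names and the [768, 24] shape are
+        hypothetical placeholders. load_hf_tensor/load_ff_tensor resolve paths
+        relative to hf_path/ff_path, and the HF activation is transposed here
+        under the assumption that the FlexFlow dump stores the hidden dimension
+        first; the two arrays must have identical shapes before the comparison.
+
+        hf_t = load_hf_tensor("fwd_step_0_layers.0.self_attn.o_proj.output_0").squeeze().T
+        ff_t = load_ff_tensor("fwd_step_0_layers_0_layers.0.self_attn_shard_0_output_0", [768, 24])
+        compare_loaded_tensors(hf_t, ff_t, tolerance=1e-2)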
+ """ + assert hf_tensor.shape == ff_tensor.shape + mismatches = [] + if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): + print(f"mismatch between hf_tensor and ff_tensor") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(mismatches) + len_hf_tensor = hf_tensor.flatten().shape[0] + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def are_np_arrays_identical(*np_arrays): + if len(np_arrays) < 2: + return True + + first = np_arrays[0] + + # Check shapes and dtypes + if not all( + t.shape == first.shape and t.dtype == first.dtype for t in np_arrays[1:] + ): + return False + + # Stack all tensors along a new axis + stacked = np.stack(np_arrays) + + # Check if all elements along the new axis are equal + return np.all(stacked == stacked[0]) + + +class TPType(Enum): + REPLICATE = 0 + PARTITION = 1 + TO_REDUCE = 2 + + +@dataclass +class TensorComparisonIdxs: + hf_tensor_type: str + ff_tensor_type: str + hf_tensor_idx: int + ff_tensor_idx: int + + +def replace_value(lst, old_value, new_value): + occurrences = lst.count(old_value) + if occurrences == 0: + raise ValueError(f"Value {old_value} not found in the list.") + elif occurrences > 1: + raise ValueError(f"Multiple instances of {old_value} found in the list.") + else: + index = lst.index(old_value) + lst[index] = new_value + return lst + + +def truncate_dimension(tensor, old_dim, new_dim): + # Check if old_dim appears exactly once in the tensor's shape + shape = tensor.shape + dim_occurrences = shape.count(old_dim) + + if dim_occurrences == 0: + raise ValueError(f"Dimension {old_dim} not found in the tensor shape.") + elif dim_occurrences > 1: + raise ValueError( + f"Multiple instances of dimension {old_dim} found in the tensor shape." + ) + + # Check if new_dim is less than or equal to old_dim + if new_dim > old_dim: + raise ValueError( + f"New dimension ({new_dim}) must be less than or equal to old dimension ({old_dim})." 
+ ) + + # Find the index of the dimension to truncate + dim_index = shape.index(old_dim) + + # Create a slice object for truncation + slices = [slice(None)] * len(shape) + slices[dim_index] = slice(0, new_dim) + + # Truncate the tensor + truncated_tensor = tensor[tuple(slices)] + + return truncated_tensor diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb new file mode 100644 index 0000000000..86a4ef76c4 --- /dev/null +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -0,0 +1,2651 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/tests/peft/hf_peft_tensors /usr/FlexFlow/build/inference_tensors\n" + ] + } + ], + "source": [ + "print(hf_path, ff_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check weights (semi-automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "def convert_hf_filename_to_ff_filename(f, num_layers=12):\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " return f_version\n", + "\n", + "files_list = os.listdir(hf_path)\n", + "num_layers=12\n", + "for f in sorted(files_list):\n", + " if f.endswith(\".weight\"):\n", + " if \"self_attn\" in f:\n", + " continue\n", + " f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers)\n", + " # print(f, f_version)\n", + " hf_w_path = os.path.join(hf_path, f)\n", + " ff_w_path = os.path.join(ff_path, f_version)\n", + " assert(os.path.isfile(hf_w_path))\n", + " assert(os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", 
ff_w_path)\n", + "\n", + " # check equivalence\n", + " compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model for automatic check" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import PeftModel, PeftConfig\n", + "use_full_precision=True\n", + "peft_model_id=\"goliaro/llama-160m-lora\"\n", + "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", + "if peft_config.peft_type != \"LORA\":\n", + " raise ValueError(f\"PEFT type {peft_config.peft_type} not supported yet\")\n", + "\n", + "peft_config.init_lora_weights = (\n", + " False\n", + ") # prevent HF from re-inizialing the weights randomly\n", + "model_name = peft_config.base_model_name_or_path\n", + "# Load base model, and apply the PEFT layer\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " torch_dtype=torch.float32 if use_full_precision else torch.float16,\n", + " device_map=\"auto\",\n", + ")\n", + "model = PeftModel.from_pretrained(model, peft_model_id, config=peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens True True\n", + "layers.0.self_attn.q_proj True True\n", + "layers.0.self_attn.k_proj True True\n", + "layers.0.self_attn.v_proj True True\n", + "layers.0.self_attn.o_proj True True\n", + "layers.0.self_attn.rotary_emb True True\n", + "layers.0.mlp.gate_proj True True\n", + "layers.0.mlp.up_proj True True\n", + "layers.0.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_A False False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.0.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_output_0\n", + "layers.0.input_layernorm True True\n", + "layers.0.post_attention_layernorm True True\n", + "layers.1.self_attn.q_proj True True\n", + "layers.1.self_attn.k_proj True True\n", + "layers.1.self_attn.v_proj True True\n", + "layers.1.self_attn.o_proj True True\n", + "layers.1.self_attn.rotary_emb True True\n", + "layers.1.mlp.gate_proj True True\n", + "layers.1.mlp.up_proj True True\n", + "layers.1.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.1.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_output_0\n", + "layers.1.input_layernorm True True\n", + "layers.1.post_attention_layernorm True True\n", + "layers.2.self_attn.q_proj True True\n", + "layers.2.self_attn.k_proj True True\n", + "layers.2.self_attn.v_proj True True\n", + "layers.2.self_attn.o_proj True True\n", + "layers.2.self_attn.rotary_emb True True\n", + "layers.2.mlp.gate_proj True True\n", + "layers.2.mlp.up_proj True True\n", + 
"layers.2.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.2.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_output_0\n", + "layers.2.input_layernorm True True\n", + "layers.2.post_attention_layernorm True True\n", + "layers.3.self_attn.q_proj True True\n", + "layers.3.self_attn.k_proj True True\n", + "layers.3.self_attn.v_proj True True\n", + "layers.3.self_attn.o_proj True True\n", + "layers.3.self_attn.rotary_emb True True\n", + "layers.3.mlp.gate_proj True True\n", + "layers.3.mlp.up_proj True True\n", + "layers.3.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.3.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_output_0\n", + "layers.3.input_layernorm True True\n", + "layers.3.post_attention_layernorm True True\n", + "layers.4.self_attn.q_proj True True\n", + "layers.4.self_attn.k_proj True True\n", + "layers.4.self_attn.v_proj True True\n", + "layers.4.self_attn.o_proj True True\n", + "layers.4.self_attn.rotary_emb True True\n", + "layers.4.mlp.gate_proj True True\n", + "layers.4.mlp.up_proj True True\n", + "layers.4.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.4.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_output_0\n", + "layers.4.input_layernorm True True\n", + "layers.4.post_attention_layernorm True True\n", + "layers.5.self_attn.q_proj True True\n", + "layers.5.self_attn.k_proj True True\n", + "layers.5.self_attn.v_proj 
True True\n", + "layers.5.self_attn.o_proj True True\n", + "layers.5.self_attn.rotary_emb True True\n", + "layers.5.mlp.gate_proj True True\n", + "layers.5.mlp.up_proj True True\n", + "layers.5.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.5.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_output_0\n", + "layers.5.input_layernorm True True\n", + "layers.5.post_attention_layernorm True True\n", + "layers.6.self_attn.q_proj True True\n", + "layers.6.self_attn.k_proj True True\n", + "layers.6.self_attn.v_proj True True\n", + "layers.6.self_attn.o_proj True True\n", + "layers.6.self_attn.rotary_emb True True\n", + "layers.6.mlp.gate_proj True True\n", + "layers.6.mlp.up_proj True True\n", + "layers.6.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_B.default True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.6.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_output_0\n", + "layers.6.input_layernorm True True\n", + "layers.6.post_attention_layernorm True True\n", + "layers.7.self_attn.q_proj True True\n", + "layers.7.self_attn.k_proj True True\n", + "layers.7.self_attn.v_proj True True\n", + "layers.7.self_attn.o_proj True True\n", + "layers.7.self_attn.rotary_emb True True\n", + "layers.7.mlp.gate_proj True True\n", + "layers.7.mlp.up_proj True True\n", + "layers.7.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.7.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_output_0\n", + "layers.7.input_layernorm True True\n", + "layers.7.post_attention_layernorm True True\n", + 
"layers.8.self_attn.q_proj True True\n", + "layers.8.self_attn.k_proj True True\n", + "layers.8.self_attn.v_proj True True\n", + "layers.8.self_attn.o_proj True True\n", + "layers.8.self_attn.rotary_emb True True\n", + "layers.8.mlp.gate_proj True True\n", + "layers.8.mlp.up_proj True True\n", + "layers.8.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.8.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_output_0\n", + "layers.8.input_layernorm True True\n", + "layers.8.post_attention_layernorm True True\n", + "layers.9.self_attn.q_proj True True\n", + "layers.9.self_attn.k_proj True True\n", + "layers.9.self_attn.v_proj True True\n", + "layers.9.self_attn.o_proj True True\n", + "layers.9.self_attn.rotary_emb True True\n", + "layers.9.mlp.gate_proj True True\n", + "layers.9.mlp.up_proj True True\n", + "layers.9.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.9.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_output_0\n", + "layers.9.input_layernorm True True\n", + "layers.9.post_attention_layernorm True True\n", + "layers.10.self_attn.q_proj True True\n", + "layers.10.self_attn.k_proj True True\n", + "layers.10.self_attn.v_proj True True\n", + "layers.10.self_attn.o_proj True True\n", + "layers.10.self_attn.rotary_emb True True\n", + "layers.10.mlp.gate_proj True True\n", + "layers.10.mlp.up_proj True True\n", + "layers.10.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.10.mlp.act_fn True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_output_0\n", + "layers.10.input_layernorm True True\n", + "layers.10.post_attention_layernorm True True\n", + "layers.11.self_attn.q_proj True True\n", + "layers.11.self_attn.k_proj True True\n", + "layers.11.self_attn.v_proj True True\n", + "layers.11.self_attn.o_proj True True\n", + "layers.11.self_attn.rotary_emb True True\n", + "layers.11.mlp.gate_proj True True\n", + "layers.11.mlp.up_proj True True\n", + "layers.11.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.11.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_output_0\n", + "layers.11.input_layernorm True True\n", + "layers.11.post_attention_layernorm True True\n", + "norm True True\n", + "lm_head True True\n" + ] + } + ], + "source": [ + "named_modules_ = [\n", + " name.replace(\"base_model.model.model.\", \"\").replace(\"base_model.model.model\", \"\").replace(\"base_model.model.\", \"\").replace(\"base_model.model\", \"\").replace(\"base_model.\", \"\").replace(\"base_model\", \"\")\n", + " for name, _ in model.named_modules()\n", + "]\n", + "\n", + "def remove_prefixes(named_modules):\n", + " i = 0\n", + " while i < len(named_modules) - 1:\n", + " if named_modules[i + 1].startswith(named_modules[i]):\n", + " named_modules.pop(i)\n", + " else:\n", + " i += 1\n", + " return named_modules\n", + "named_modules = remove_prefixes(named_modules_)\n", + "\n", + "def convert_hf_module_name_to_ff_filenames(n, num_layers=12):\n", + " if n == \"embed_tokens\":\n", + " ff_in_name = 
\"fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + " ff_out_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + " elif n == \"lm_head\" or n == \"norm\":\n", + " ff_in_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_output_0\"\n", + " elif n.startswith(\"layers.\"):\n", + " layernum = n.split(\"layers.\")[1].split(\".\")[0]\n", + " ff_in_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_output_0\"\n", + " else:\n", + " assert False, f\"Module {n} not supported yet\"\n", + " return os.path.join(ff_path, ff_in_name), os.path.join(ff_path, ff_out_name)\n", + "\n", + "# Compute the hf path, check if the input and output are there\n", + "for n in named_modules:\n", + " in_name = f\"fwd_step_0_{n}.input_0\"\n", + " out_name = f\"fwd_step_0_{n}.output_0\"\n", + " if n == \"lm_head\":\n", + " in_name = f\"fwd_step_0_base_model.model.{n}.input_0\"\n", + " out_name = f\"fwd_step_0_base_model.model.{n}.output_0\"\n", + " hf_mod_in = os.path.join(hf_path, in_name)\n", + " hf_mod_out = os.path.join(hf_path, out_name)\n", + " check = os.path.exists(hf_mod_in) and os.path.exists(hf_mod_out)\n", + " \n", + " check2=True\n", + " if \"self_attn\" not in n:\n", + " ff_mod_in, ff_mod_out = convert_hf_module_name_to_ff_filenames(n, num_layers=num_layers)\n", + " check2 = os.path.exists(ff_mod_in) and os.path.exists(ff_mod_out)\n", + " print(n, check, check2)\n", + " if not check2:\n", + " print(\"\\t\", ff_mod_in, ff_mod_out)\n", + " # print(n, check)\n", + " # print(\"\\t\", )\n", + " \n", + "\n", + "# Compute the corresponding ff path, check if the input and output are there\n", + "\n", + "# for x in named_modules:\n", + "# print(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'down_proj'}\n" + ] + } + ], + "source": [ + "print(model.peft_config['default'].target_modules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual check" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "hf_embed_input= \"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.input_0\"\n", + "ff_embed_input=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + "compare_tensors(hf_embed_input, ff_embed_input)\n", + "hf_embed_output=\"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.output_0\"\n", + "ff_embed_output=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + "compare_tensors(hf_embed_output, ff_embed_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.10.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.06630182 6.3429456\n", + " -0.21220279]\n", + "FF:[ 0. 0. 0. ... 0.06630275 6.34293985\n", + " -0.21219885]\n", + "[ True True True ... True True True]\n", + "[15889]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.14172177 9.79423\n", + " -6.2940273 ]\n", + "FF:[ 0. 0. 0. ... 0.14172006 9.79421902\n", + " -6.29402065]\n", + "[ True True True ... 
True True True]\n", + "[ 2878 3206 3367 3607 5183 5346 6257 6544 7466 7679 7805 8119\n", + " 8159 8911 9450 9897 13696 13938 14058 14599 15126 15839 16128 16195]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for i in range(tot_num_layers):\n", + " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", + " if i > 0:\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5)\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " if i > 0:\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5)\n", + " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5)\n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = 
f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- LM head --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Final Norm --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "# ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "print(\"-- LM head --\")\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "print(\"-- Final Norm --\")\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + 
"compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_path}/norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_feed_forward_w2_shard_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_attention_shard_0_o_proj_in_grad\n", + "HF: [ 1.2223595e+06 -2.6348565e+06 -5.0760525e+05 ... 
6.8275871e+01\n", + " -5.8116108e+01 9.5347488e+01]\n", + "FF:[ 1.22235925e+06 -2.63485625e+06 -5.07605000e+05 ... 6.82758865e+01\n", + " -5.81161423e+01 9.53475494e+01]\n", + "[ True True True ... True True True]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.22235950e+06 9.93645859e+01 -2.82157593e+01 ... -3.94578514e+01\n", + " -1.98409653e+01 -1.33438044e+01]\n", + " [-2.63485650e+06 -1.13461929e+02 1.14223976e+02 ... 7.52578735e+01\n", + " 1.33362747e+02 6.78501587e+01]\n", + " [-5.07605250e+05 4.34111862e+01 8.10619354e+01 ... 4.70537224e+01\n", + " 4.02149696e+01 6.98045502e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295319e+02 9.98417091e+00 ... 4.90895653e+01\n", + " 9.71413574e+01 6.82758713e+01]\n", + " [-3.64456375e+06 -2.43692596e+02 -6.85474396e+00 ... -3.71503868e+01\n", + " -1.34136658e+01 -5.81161079e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64005566e+00 ... 2.11662292e+00\n", + " 3.37400856e+01 9.53474884e+01]]\n", + "FF:[[ 1.22235925e+06 9.93645630e+01 -2.82157211e+01 ... -3.94577713e+01\n", + " -1.98408775e+01 -1.33438234e+01]\n", + " [-2.63485625e+06 -1.13461960e+02 1.14224037e+02 ... 7.52577744e+01\n", + " 1.33362701e+02 6.78501205e+01]\n", + " [-5.07605000e+05 4.34111404e+01 8.10619278e+01 ... 4.70536804e+01\n", + " 4.02149124e+01 6.98045578e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295227e+02 9.98412323e+00 ... 4.90895386e+01\n", + " 9.71413727e+01 6.82758865e+01]\n", + " [-3.64456400e+06 -2.43692627e+02 -6.85472488e+00 ... -3.71504822e+01\n", + " -1.34137001e+01 -5.81161423e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64004517e+00 ... 2.11670875e+00\n", + " 3.37400322e+01 9.53475494e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.2223588e+06 -2.6348530e+06 -5.0760291e+05 ... 3.0279325e+06\n", + " -3.6445672e+06 3.3192180e+06]\n", + " [-4.2496326e+02 1.1576636e+03 9.8397858e+02 ... 1.6480791e+03\n", + " -5.9697235e+02 6.2627173e+02]\n", + " [-2.2012039e+01 6.6097900e+01 3.9933994e+01 ... 5.7103355e+01\n", + " -1.5968766e+01 3.6536639e+00]\n", + " ...\n", + " [-1.2302110e+00 5.3052688e+00 2.1982718e+00 ... 1.3990868e+00\n", + " -5.5132383e-01 4.8985812e-01]\n", + " [-1.0771493e+00 6.9571300e+00 2.7373023e+00 ... 4.9663010e+00\n", + " -9.9705428e-01 2.1829298e+00]\n", + " [-5.9534687e-01 3.0272012e+00 3.1143982e+00 ... 2.4072502e+00\n", + " -2.0490403e+00 3.3617332e+00]]\n", + "FF:[[ 1.22235850e+06 -2.63485275e+06 -5.07602656e+05 ... 3.02793250e+06\n", + " -3.64456750e+06 3.31921800e+06]\n", + " [-4.24962585e+02 1.15766296e+03 9.83978577e+02 ... 1.64807898e+03\n", + " -5.96972351e+02 6.26271790e+02]\n", + " [-2.20120354e+01 6.60979462e+01 3.99340210e+01 ... 5.71033745e+01\n", + " -1.59687757e+01 3.65366316e+00]\n", + " ...\n", + " [-1.23020661e+00 5.30526114e+00 2.19826817e+00 ... 1.39908671e+00\n", + " -5.51325083e-01 4.89858717e-01]\n", + " [-1.07714510e+00 6.95712519e+00 2.73729825e+00 ... 4.96630049e+00\n", + " -9.97055829e-01 2.18292713e+00]\n", + " [-5.95347941e-01 3.02720070e+00 3.11439991e+00 ... 2.40725493e+00\n", + " -2.04904509e+00 3.36174107e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.4363425925925934% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523600e+06 -1.27625293e+03 -4.39336700e+01 ... -3.34414597e+01\n", + " 2.38162422e+01 3.15938187e+01]\n", + " [-9.55138900e+06 6.71377319e+02 2.06871674e+02 ... -3.86393127e+01\n", + " 2.14817867e+01 -6.58600464e+01]\n", + " [ 1.14522660e+07 2.19898950e+03 -6.89660644e+00 ... 9.51594448e+00\n", + " -1.68611774e+01 6.02474518e+01]\n", + " ...\n", + " [ 2.10891850e+06 3.78648633e+03 1.02701196e+03 ... 3.59794846e+01\n", + " 5.03901253e+01 4.19777679e+01]\n", + " [ 2.11695400e+06 -2.36282440e+02 -1.08002762e+02 ... 9.36448860e+00\n", + " 3.84096107e+01 -7.51954842e+00]\n", + " [ 7.39155000e+06 1.11731921e+03 3.38370087e+02 ... 3.70398293e+01\n", + " 1.77627277e+01 9.76782227e+01]]\n", + "6.011284722222222% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750 900 1198 1249\n", + " 1287 1305 1414 1428 1490 1588 1600 1612 1625 1657 1676 1677\n", + " 1692 1694 1724 1730 1772 1822 1825 1838 1853 1910 2035 2043\n", + " 2053 2059 2073 2078 2123 2145 2214 2238 2241 2285 2292 2389\n", + " 2542 2582 2589 2599 2674 2688 2711 2840 2856 2961 2963 2980\n", + " 3064 3176 3192 3255 3262 3278 3338 3341 3412 3419 3492 3590\n", + " 3624 3646 3657 3807 3840 3842 3846 3883 3887 4005 4049 4071\n", + " 4076 4077 4079 4137 4142 4192 4193 4202 4218 4224 4273 4355\n", + " 4358 4381 4401 4435 4469 4499 4514 4546 4598 4619 4747 4846\n", + " 4872 4916 4952 4966 5016 5067 5107 5112 5116 5194 5225 5350\n", + " 5364 5403 5515 5537 5550 5578 5650 5653 5654 5736 5751 5837\n", + " 5870 5881 5972 5998 6006 6051 6061 6107 6129 6204 6236 6292\n", + " 6296 6327 6382 6393 6403 6420 6424 6436 6468 6542 6599 6675\n", + " 6681 6711 6723 6767 6823 6914 6983 7047 7064 7133 7167 7197\n", + " 7198 7209 7528 7537 7538 7686 7850 7855 7889 7910 7919 7927\n", + " 7937 7939 8089 8101 8157 8169 8175 8223 8292 8304 8306 8342\n", + " 8351 8414 8475 8500 8543 8558 8609 8656 8687 8704 8724 8726\n", + " 8777 8816 8826 8871 8904 8934 8983 9012 9033 9043 9068 9093\n", + " 9125 9133 9144 9151 9154 9217 9222 9320 9335 9367 9398 9421\n", + " 9434 9521 9547 9633 9702 9726 9763 9949 10018 10053 10062 10079\n", + " 10137 10149 10203 10261 10269 10292 10312 10332 10471 10478 10514 10596\n", + " 10645 10676 10678 10781 10795 10810 10833 10891 10904 10935 10957 10977\n", + " 10982 11028 11095 11172 11223 11251 11283 11303 11319 11374 11392 11437\n", + " 11486 11627 11678 11750 11759 11979 11996 12019 12126 12237 12262 12288\n", + " 12303 12309 12315 12387 12543 12569 12613 12648 12786 12852 12866 12879\n", + " 12947 12963 13037 13058 13261 13284 13312 13394 13399 13427 13526 13527\n", + " 13592 13695 13741 13752 13775 13803 13812 13866 13902 14049 14170 14241\n", + " 14354 14382 14426 14451 14455 14486 14502 14582 14820 14934 14961 14976\n", + " 15000 15003 15014 15077 15096 15108 15135 15148 15165 15219 15232 15290\n", + " 15339 15345 15819 15945 15994 16077 16135 16218 16231 16233 16239 16243\n", + " 16295 16311 16339 16356 16366 16417 16456 16498 16502 16503 16506 16547\n", + " 16585 16603 16611 16633 16661 16683 16704 16710 16723 16724 16745 16754\n", + " 16773 16787 16789 16818 16829 16833 16913 16933 17025 17033 17037 17055\n", + " 17084 17098 17109 17176 17225 17240 17292 17294 17339 17390 17427 17437\n", + " 17579 17626 17630 17654 17719 17902 17912 18023 18025 18124 18203 18339\n", + " 18344]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 4.7819588e+07 3.8833264e+07 4.7789860e+07 ... 1.0804405e+00\n", + " 2.7186510e-01 -2.9918199e+00]\n", + "FF:[ 4.78195960e+07 3.88332640e+07 4.77898600e+07 ... 1.08044124e+00\n", + " 2.71864563e-01 -2.99182224e+00]\n", + "[ True True True ... True True True]\n", + "[ 109 211 312 422 590 832 835 1016 1053 1076 1268 1353 1374 1693\n", + " 1701 1710 1722 1832 1954 1965 1997 2076 2124 2146 2378 2520 2605 2624\n", + " 2967 3007 3015]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_input_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 9.3464905e+01\n", + " 7.5613129e+01 7.6598846e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 9.34649200e+01\n", + " 7.56131058e+01 7.65989227e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_o_proj_in_grad\n", + "HF: [-9.4470595e+09 -7.3870331e+09 1.2659395e+10 ... -2.8149616e+01\n", + " 1.7019112e+02 -7.7236428e+00]\n", + "FF:[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... -2.81496239e+01\n", + " 1.70191177e+02 -7.72364044e+00]\n", + "[ True True True ... True True True]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 2.28078384e+01 3.18554016e+02 ... 1.17267204e+02\n", + " 2.06791725e+01 1.13138672e+02]\n", + " [-7.38703309e+09 -7.36898804e+00 7.93705673e+01 ... 2.04039650e+01\n", + " 3.18331490e+01 5.44241562e+01]\n", + " [ 1.26593946e+10 1.77534424e+02 -2.97175941e+01 ... 1.16716766e+01\n", + " 7.70214081e+01 2.81902496e+02]\n", + " ...\n", + " [ 4.51210445e+10 3.63867615e+02 -8.04915466e+01 ... -1.34332123e+02\n", + " -1.22151840e+02 -2.81496162e+01]\n", + " [-1.39591885e+10 1.59216873e+02 6.11343079e+01 ... 1.56675262e+02\n", + " 9.68551483e+01 1.70191116e+02]\n", + " [-1.29442345e+10 -2.39441833e+02 2.73647644e+02 ... -4.41197014e+01\n", + " -9.48526230e+01 -7.72364283e+00]]\n", + "FF:[[-9.44706150e+09 2.28079376e+01 3.18553864e+02 ... 1.17267227e+02\n", + " 2.06791859e+01 1.13138741e+02]\n", + " [-7.38703309e+09 -7.36921692e+00 7.93703690e+01 ... 2.04038925e+01\n", + " 3.18332825e+01 5.44241333e+01]\n", + " [ 1.26593966e+10 1.77534454e+02 -2.97174206e+01 ... 1.16717224e+01\n", + " 7.70213699e+01 2.81902618e+02]\n", + " ...\n", + " [ 4.51210527e+10 3.63867554e+02 -8.04915695e+01 ... -1.34332092e+02\n", + " -1.22151901e+02 -2.81496239e+01]\n", + " [-1.39591834e+10 1.59216995e+02 6.11343040e+01 ... 1.56675293e+02\n", + " 9.68551559e+01 1.70191177e+02]\n", + " [-1.29442304e+10 -2.39441772e+02 2.73647644e+02 ... -4.41196594e+01\n", + " -9.48526916e+01 -7.72364044e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 -7.38703309e+09 1.26593946e+10 ... 4.51210445e+10\n", + " -1.39591885e+10 -1.29442345e+10]\n", + " [ 1.14852783e+03 4.39543152e+02 1.07877356e+03 ... -2.42416113e+03\n", + " 2.64504834e+03 4.68633453e+02]\n", + " [ 5.72417107e+01 4.12602806e+01 -2.27319489e+01 ... -3.40788422e+01\n", + " 4.86237946e+01 1.25752163e+01]\n", + " ...\n", + " [ 6.76848269e+00 8.23165894e+00 2.10253639e+01 ... -3.19590777e-01\n", + " 3.68098617e-01 -1.95310101e-01]\n", + " [ 4.08574820e+00 5.33035660e+00 1.41003275e+01 ... -1.35607815e+00\n", + " 4.06074905e+00 -7.67630756e-01]\n", + " [ 2.03186665e+01 9.77407932e+00 5.06271019e+01 ... -6.80029154e-01\n", + " 4.11142111e+00 -1.86585218e-01]]\n", + "FF:[[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... 4.51210527e+10\n", + " -1.39591834e+10 -1.29442304e+10]\n", + " [ 1.14852808e+03 4.39542755e+02 1.07877344e+03 ... 
-2.42416138e+03\n", + " 2.64504932e+03 4.68633698e+02]\n", + " [ 5.72415771e+01 4.12602005e+01 -2.27318707e+01 ... -3.40787392e+01\n", + " 4.86236725e+01 1.25752039e+01]\n", + " ...\n", + " [ 6.76847696e+00 8.23167515e+00 2.10253181e+01 ... -3.19590837e-01\n", + " 3.68098557e-01 -1.95310280e-01]\n", + " [ 4.08574867e+00 5.33037567e+00 1.41003180e+01 ... -1.35607564e+00\n", + " 4.06074095e+00 -7.67629445e-01]\n", + " [ 2.03186874e+01 9.77407932e+00 5.06271439e+01 ... -6.80029511e-01\n", + " 4.11142349e+00 -1.86585203e-01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "6.640625% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-5.1505955e+10 -4.7166772e+03 -1.3288132e+02 ... -3.0123844e+00\n", + " -5.5234032e+01 6.0299168e+00]\n", + " [-3.5960029e+10 -5.3263096e+03 -1.9434322e+02 ... -5.6601189e+01\n", + " -1.0787462e+02 -6.0718418e+01]\n", + " [ 4.8131662e+10 1.1578307e+04 1.7744476e+02 ... -5.6970375e+01\n", + " -1.7497168e+01 -7.2297249e+00]\n", + " ...\n", + " [-9.0346426e+08 6.4752144e+03 3.2408417e+02 ... 6.1075470e+01\n", + " 8.5356834e+01 8.3221588e+01]\n", + " [-5.0754217e+09 -2.2929268e+03 -1.4913528e+02 ... 8.6639397e+01\n", + " 1.1156468e+02 1.0695674e+02]\n", + " [ 5.5844772e+09 3.0225920e+03 -6.3137859e+01 ... -6.5270996e+01\n", + " 8.2730171e+01 -1.0107367e+02]]\n", + "ff_attn_in: (768, 24)\n", + "[[-5.15059548e+10 -4.71667773e+03 -1.32881012e+02 ... -3.01225996e+00\n", + " -5.52339973e+01 6.02991867e+00]\n", + " [-3.59600292e+10 -5.32630957e+03 -1.94343079e+02 ... -5.66010437e+01\n", + " -1.07874649e+02 -6.07182846e+01]\n", + " [ 4.81316659e+10 1.15783076e+04 1.77444519e+02 ... -5.69703102e+01\n", + " -1.74972763e+01 -7.22990799e+00]\n", + " ...\n", + " [-9.03455232e+08 6.47521484e+03 3.24083832e+02 ... 6.10753632e+01\n", + " 8.53567886e+01 8.32217255e+01]\n", + " [-5.07543654e+09 -2.29292749e+03 -1.49135025e+02 ... 8.66392517e+01\n", + " 1.11564789e+02 1.06956917e+02]\n", + " [ 5.58446592e+09 3.02259229e+03 -6.31376152e+01 ... -6.52709351e+01\n", + " 8.27302551e+01 -1.01073837e+02]]\n", + "7.025824652777778% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415\n", + " 428 482 492 514 526 531 671 731 763 777 893 927\n", + " 984 1105 1184 1206 1418 1541 1548 1572 1577 1613 1619 1643\n", + " 1658 1661 1691 1701 1706 1726 1757 1784 1815 1833 1849 1856\n", + " 1880 1891 1921 1956 1969 2012 2021 2028 2030 2059 2065 2144\n", + " 2149 2183 2210 2238 2292 2342 2357 2384 2414 2495 2531 2565\n", + " 2597 2662 2713 2781 2821 2829 2877 2904 2921 2927 2962 2973\n", + " 3044 3066 3094 3100 3106 3159 3193 3251 3377 3389 3397 3427\n", + " 3436 3570 3594 3703 3729 3770 3772 3780 3811 3840 3842 3860\n", + " 3907 3920 3929 3946 3955 3969 4005 4009 4034 4048 4077 4089\n", + " 4104 4129 4134 4178 4202 4212 4219 4239 4245 4256 4273 4373\n", + " 4407 4463 4464 4465 4481 4511 4537 4541 4543 4549 4597 4599\n", + " 4633 4759 4760 4789 4846 4884 4901 4930 4954 4971 4993 5024\n", + " 5030 5041 5050 5116 5130 5163 5207 5224 5282 5313 5322 5349\n", + " 5363 5403 5410 5412 5454 5543 5581 5590 5654 5673 5784 5821\n", + " 5849 5880 5911 5917 5982 6000 6062 6165 6178 6193 6200 6272\n", + " 6322 6351 6366 6376 6380 6382 6393 6412 6420 6430 6433 6446\n", + " 6476 6482 6488 6490 6519 6527 6540 6556 6563 6567 6577 6600\n", + " 6619 6680 6709 6735 6768 6777 6780 6823 6825 6826 6830 6863\n", + " 6880 6912 6988 7006 7030 7071 7077 7102 7123 7244 7264 7367\n", + " 7389 7390 7434 7451 7452 7455 7505 7532 7539 7589 7598 7620\n", + " 7651 7653 7659 7709 7714 7740 7751 7759 7803 7808 7820 7917\n", + " 7923 7926 7949 7962 7966 7978 8002 8004 8040 8050 8052 8068\n", + " 8180 8223 8250 8253 8265 8341 8344 8375 8376 8386 8449 8468\n", + " 8501 8509 8522 8535 8585 8590 8593 8642 8657 8674 8687 8707\n", + " 8714 8726 8729 8737 8756 8769 8801 8846 8850 8865 8907 8998\n", + " 9018 9043 9059 9066 9083 9093 9098 9130 9131 9165 9189 9216\n", + " 9285 9337 9368 9526 9539 9563 9620 9659 9723 9793 9804 9817\n", + " 9820 9827 9908 9995 10053 10128 10135 10143 10205 10253 10274 10292\n", + " 10300 10311 10327 10356 10406 10441 10491 10494 10551 10562 10563 10634\n", + " 10649 10674 10710 10734 10821 10831 10833 10838 10845 10911 10966 10981\n", + " 10988 10990 10998 11008 11044 11049 11100 11127 11141 11197 11250 11269\n", + " 11285 11308 11361 11383 11437 11460 11494 11502 11511 11522 11546 11557\n", + " 11564 11588 11649 11658 11671 11674 11703 11729 11749 11759 11832 11892\n", + " 11979 11988 12000 12038 12063 12078 12107 12119 12165 12259 12269 12270\n", + " 12347 12369 12386 12415 12475 12518 12566 12569 12574 12652 12693 12792\n", + " 12833 12834 12852 12872 12900 12946 13117 13121 13124 13321 13345 13357\n", + " 13427 13431 13446 13473 13526 13635 13638 13662 13706 13733 13803 13807\n", + " 13852 13882 13912 13924 13962 13969 13986 14023 14036 14046 14085 14110\n", + " 14130 14141 14175 14183 14191 14220 14222 14223 14285 14310 14331 14336\n", + " 14354 14375 14425 14427 14451 14482 14493 14516 14560 14563 14581 14623\n", + " 14671 14677 14679 14680 14685 14688 14742 14799 14860 14868 14870 14872\n", + " 14900 14909 14916 14940 14964 14991 15003 15023 15027 15033 15038 15051\n", + " 15086 15100 15184 15214 15232 15290 15352 15363 15365 15407 15433 15451\n", + " 15522 15577 15707 15720 15725 15739 15830 15837 15875 15937 15965 15985\n", + " 16017 16054 16113 16136 16142 16169 16191 16232 16238 16250 16268 16282\n", + " 16285 16290 16295 16304 16327 16334 16353 16356 16363 16382 16403 16407\n", + " 16408 16409 16458 16459 16495 16497 16499 16500 16516 16532 16595 16603\n", + " 16611 16657 16678 16680 16695 16701 16704 16754 16768 
16807 16818 16856\n", + " 16870 16951 16971 16986 16989 16992 17048 17134 17181 17208 17217 17236\n", + " 17243 17319 17363 17398 17448 17471 17497 17557 17646 17654 17659 17692\n", + " 17754 17947 17957 17969 17975 18029 18128 18146 18196 18206 18207 18250\n", + " 18265 18313 18406]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 5.0590863e+10 3.7823513e+11 -5.0394451e+11 ... -5.5814421e-01\n", + " 2.2970559e-01 -1.2293311e+00]\n", + "FF:[ 5.05906831e+10 3.78235290e+11 -5.03944544e+11 ... -5.58144033e-01\n", + " 2.29705781e-01 -1.22933090e+00]\n", + "[ True True True ... True True True]\n", + "[ 189 254 317 418 515 546 577 634 636 675 712 808 1011 1030\n", + " 1080 1091 1132 1168 1254 1265 1285 1287 1354 1381 1427 1459 1506 1620\n", + " 1654 1752 1887 1897 1900 1937 1981 1985 1986 2003 2029 2152 2181 2295\n", + " 2395 2426 2445 2673 2687 2859 2947 2977 3037]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_input_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_output_0\n", + "HF: [-6.3320325e+13 -4.4365129e+13 6.3550937e+13 ... 7.2449814e+01\n", + " 8.6617142e+01 8.3981407e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 7.24498901e+01\n", + " 8.66170959e+01 8.39814606e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_o_proj_in_grad\n", + "HF: [ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... 2.5297220e+02\n", + " -8.1722275e+01 -7.0014725e+01]\n", + "FF:[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... 2.52972260e+02\n", + " -8.17222137e+01 -7.00146637e+01]\n", + "[ True True True ... True True True]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.28854608e+13 6.37500977e+02 2.96775421e+02 ... 8.35403061e+01\n", + " 1.72460327e+02 2.90482426e+01]\n", + " [-6.08358210e+13 -5.23222847e+01 -2.34542664e+02 ... -1.87500763e+01\n", + " -8.99429398e+01 8.64021378e+01]\n", + " [-7.97326117e+13 -4.24736328e+02 -1.82208099e+02 ... 3.21808720e+00\n", + " -5.87415466e+01 -2.08511108e+02]\n", + " ...\n", + " [-1.13411917e+14 -3.48418640e+02 1.52205795e+02 ... 1.51519928e+02\n", + " 2.45651031e+02 2.52972198e+02]\n", + " [-3.75985275e+12 2.39696625e+02 1.51989685e+02 ... -2.85605354e+01\n", + " -1.79121232e+00 -8.17222748e+01]\n", + " [ 1.11016038e+14 -1.96372967e+01 -1.27668396e+02 ... 3.35008011e+01\n", + " -7.46116943e+01 -7.00147247e+01]]\n", + "FF:[[ 7.28854608e+13 6.37500977e+02 2.96775513e+02 ... 8.35403976e+01\n", + " 1.72460068e+02 2.90483646e+01]\n", + " [-6.08357832e+13 -5.23225098e+01 -2.34542755e+02 ... -1.87501526e+01\n", + " -8.99431992e+01 8.64022217e+01]\n", + " [-7.97326201e+13 -4.24736572e+02 -1.82207733e+02 ... 3.21793270e+00\n", + " -5.87416573e+01 -2.08511139e+02]\n", + " ...\n", + " [-1.13411925e+14 -3.48418640e+02 1.52205902e+02 ... 1.51519714e+02\n", + " 2.45650864e+02 2.52972260e+02]\n", + " [-3.75988630e+12 2.39696686e+02 1.51989319e+02 ... -2.85606136e+01\n", + " -1.79138493e+00 -8.17222137e+01]\n", + " [ 1.11016046e+14 -1.96372318e+01 -1.27668480e+02 ... 3.35009079e+01\n", + " -7.46116791e+01 -7.00146637e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... -1.1341192e+14\n", + " -3.7598527e+12 1.1101604e+14]\n", + " [ 3.3241980e+03 -6.3044128e+02 -3.0447307e+03 ... 3.0137921e+02\n", + " 3.8262988e+02 -4.2889914e+02]\n", + " [ 3.5639046e+01 -1.6155790e+01 -2.4461178e+01 ... 2.7450909e+02\n", + " 1.6181946e+02 -2.5407137e+02]\n", + " ...\n", + " [ 4.6487908e+00 -9.6633381e-01 -2.7078497e-01 ... 3.6374569e+01\n", + " -1.7563061e+00 -7.1206141e+00]\n", + " [ 1.8901447e+00 8.9006472e-01 -4.3125896e+00 ... 2.6014965e+01\n", + " -3.7720141e-01 -7.8855257e+00]\n", + " [ 1.9513500e+00 5.8041654e+00 -1.4006979e+01 ... 7.2743622e+01\n", + " -2.3499712e+01 -2.0133139e+01]]\n", + "FF:[[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... -1.13411925e+14\n", + " -3.75988630e+12 1.11016046e+14]\n", + " [ 3.32419922e+03 -6.30442505e+02 -3.04472998e+03 ... 3.01379364e+02\n", + " 3.82629669e+02 -4.28898712e+02]\n", + " [ 3.56390572e+01 -1.61558037e+01 -2.44611683e+01 ... 2.74509308e+02\n", + " 1.61819229e+02 -2.54071594e+02]\n", + " ...\n", + " [ 4.64879847e+00 -9.66338813e-01 -2.70792574e-01 ... 3.63745117e+01\n", + " -1.75632846e+00 -7.12060070e+00]\n", + " [ 1.89013767e+00 8.90062451e-01 -4.31257772e+00 ... 2.60149212e+01\n", + " -3.77217919e-01 -7.88551569e+00]\n", + " [ 1.95135939e+00 5.80417490e+00 -1.40069904e+01 ... 7.27435226e+01\n", + " -2.34996586e+01 -2.01330910e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.609953703703703% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-1.17282076e+14 -2.12461621e+03 8.80099030e+01 ... 4.34470520e+01\n", + " 7.55885468e+01 -2.88791332e+01]\n", + " [-2.07757936e+14 -3.81796265e+02 -2.33774780e+02 ... 8.11984329e+01\n", + " -4.41825638e+01 7.35064125e+00]\n", + " [ 4.11484165e+13 2.50572113e+02 1.91601822e+02 ... 1.00269365e+01\n", + " -3.41638985e+01 1.20433075e+02]\n", + " ...\n", + " [ 7.95562329e+13 1.55007373e+03 1.70351212e+02 ... -1.80320053e+01\n", + " 8.77533417e+01 2.14678173e+01]\n", + " [-1.86546485e+14 -5.18847070e+03 -3.34331085e+02 ... 2.51586838e+01\n", + " -4.06135368e+01 -6.27860641e+00]\n", + " [ 1.89751705e+14 -3.09853809e+03 -1.18278351e+01 ... -1.24640663e+02\n", + " 1.59719009e+01 -6.47173615e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-1.17282034e+14 -2.12461694e+03 8.80101547e+01 ... 4.34468918e+01\n", + " 7.55886002e+01 -2.88791542e+01]\n", + " [-2.07757920e+14 -3.81795776e+02 -2.33774765e+02 ... 8.11985397e+01\n", + " -4.41825829e+01 7.35066986e+00]\n", + " [ 4.11484543e+13 2.50570099e+02 1.91601196e+02 ... 1.00270777e+01\n", + " -3.41638451e+01 1.20433121e+02]\n", + " ...\n", + " [ 7.95562413e+13 1.55007288e+03 1.70350784e+02 ... -1.80321960e+01\n", + " 8.77533112e+01 2.14678249e+01]\n", + " [-1.86546469e+14 -5.18847070e+03 -3.34331268e+02 ... 
2.51588135e+01\n", + " -4.06132622e+01 -6.27861023e+00]\n", + " [ 1.89751521e+14 -3.09853711e+03 -1.18275299e+01 ... -1.24640862e+02\n", + " 1.59719791e+01 -6.47173767e+01]]\n", + "7.530381944444445% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181\n", + " 226 261 284 318 320 378 382 385 391 395 403 422\n", + " 434 495 515 523 524 549 579 610 644 710 764 772\n", + " 870 984 987 1045 1249 1330 1362 1489 1517 1550 1556 1588\n", + " 1595 1659 1672 1684 1689 1768 1792 1799 1808 1818 1842 1871\n", + " 1889 1899 1910 1915 1925 1936 1993 1997 2033 2041 2059 2062\n", + " 2066 2098 2111 2124 2129 2130 2146 2153 2159 2166 2197 2206\n", + " 2210 2212 2222 2234 2237 2320 2321 2357 2359 2362 2385 2428\n", + " 2518 2539 2553 2568 2598 2683 2689 2694 2711 2714 2733 2787\n", + " 2788 2795 2811 2815 2853 2881 2890 2917 2981 2997 3021 3037\n", + " 3089 3149 3163 3191 3196 3217 3225 3248 3277 3287 3292 3305\n", + " 3327 3361 3385 3402 3417 3425 3456 3479 3516 3521 3528 3555\n", + " 3587 3599 3608 3684 3702 3733 3770 3779 3819 3822 3823 3898\n", + " 3921 3942 3950 4012 4053 4077 4086 4091 4139 4185 4198 4225\n", + " 4241 4296 4347 4349 4368 4403 4407 4418 4453 4471 4472 4473\n", + " 4494 4537 4549 4555 4558 4598 4623 4648 4666 4698 4729 4782\n", + " 4848 4866 4886 4943 4959 5008 5010 5012 5057 5079 5177 5178\n", + " 5186 5211 5271 5281 5296 5313 5328 5356 5364 5409 5429 5440\n", + " 5453 5455 5457 5476 5529 5563 5591 5621 5625 5631 5654 5661\n", + " 5692 5705 5720 5740 5751 5758 5787 5799 5813 5835 5836 5867\n", + " 5872 5893 5953 5974 5980 5982 6000 6055 6082 6086 6102 6107\n", + " 6123 6159 6172 6193 6220 6230 6231 6263 6286 6297 6362 6396\n", + " 6401 6430 6436 6485 6497 6499 6502 6510 6537 6554 6555 6563\n", + " 6564 6579 6586 6598 6615 6625 6626 6649 6651 6661 6754 6764\n", + " 6776 6852 6863 6874 6883 6892 6913 6945 6969 7036 7057 7066\n", + " 7082 7138 7147 7150 7157 7197 7202 7231 7234 7235 7240 7270\n", + " 7278 7287 7322 7327 7345 7348 7361 7390 7402 7490 7539 7573\n", + " 7610 7714 7721 7758 7794 7812 7827 7829 7837 7839 7882 7894\n", + " 7943 7948 7952 7969 7975 7996 8024 8027 8037 8043 8055 8078\n", + " 8079 8088 8090 8095 8154 8258 8264 8283 8297 8313 8329 8336\n", + " 8359 8361 8376 8383 8416 8421 8428 8454 8475 8502 8521 8613\n", + " 8642 8653 8696 8756 8764 8777 8791 8837 8849 8859 8878 8955\n", + " 8991 8997 9006 9012 9040 9066 9093 9097 9098 9131 9158 9162\n", + " 9165 9214 9216 9280 9297 9301 9316 9355 9371 9412 9421 9475\n", + " 9510 9580 9620 9645 9696 9713 9732 9768 9802 9817 9819 9826\n", + " 9839 9846 9947 10004 10062 10065 10072 10103 10107 10108 10138 10167\n", + " 10173 10228 10262 10292 10326 10356 10360 10372 10421 10446 10466 10468\n", + " 10499 10505 10513 10517 10589 10606 
10612 10645 10664 10669 10726 10777\n", + " 10835 10838 10839 10848 10855 10877 10897 10941 10963 10971 10977 10997\n", + " 11030 11060 11065 11076 11088 11140 11167 11174 11231 11252 11257 11259\n", + " 11275 11297 11302 11319 11331 11333 11357 11358 11380 11382 11402 11423\n", + " 11446 11447 11500 11501 11522 11585 11623 11670 11728 11736 11759 11761\n", + " 11772 11785 11839 11894 11916 11924 11936 11962 11968 11969 11977 11984\n", + " 12008 12030 12054 12074 12123 12175 12182 12194 12237 12262 12282 12285\n", + " 12341 12348 12351 12370 12376 12386 12399 12449 12507 12513 12518 12522\n", + " 12549 12572 12643 12648 12663 12689 12696 12710 12769 12780 12788 12792\n", + " 12793 12852 12864 12879 12884 12985 13018 13041 13057 13176 13264 13272\n", + " 13274 13275 13292 13303 13333 13379 13427 13428 13442 13451 13454 13500\n", + " 13510 13533 13564 13588 13607 13640 13655 13686 13687 13688 13732 13747\n", + " 13786 13801 13803 13826 13841 13846 13850 13892 13909 13946 14036 14040\n", + " 14046 14060 14080 14152 14161 14183 14195 14210 14240 14278 14331 14354\n", + " 14370 14372 14386 14395 14409 14432 14434 14497 14506 14531 14559 14589\n", + " 14648 14663 14686 14698 14715 14743 14757 14799 14808 14810 14849 14893\n", + " 14902 14929 14937 14947 14953 14958 15005 15012 15018 15036 15066 15069\n", + " 15083 15152 15154 15196 15197 15212 15292 15309 15323 15340 15343 15375\n", + " 15389 15396 15408 15410 15454 15499 15532 15557 15605 15647 15677 15736\n", + " 15745 15756 15769 15809 15824 15876 15882 15900 15906 15941 16027 16030\n", + " 16040 16116 16190 16192 16205 16207 16239 16279 16285 16295 16348 16358\n", + " 16367 16384 16386 16394 16399 16455 16457 16458 16471 16495 16500 16502\n", + " 16520 16541 16542 16598 16623 16643 16651 16665 16673 16679 16713 16725\n", + " 16734 16736 16739 16751 16756 16768 16861 16870 16939 16976 17007 17028\n", + " 17040 17069 17087 17108 17125 17139 17151 17158 17174 17175 17178 17182\n", + " 17189 17221 17258 17341 17360 17370 17381 17395 17396 17415 17432 17450\n", + " 17463 17470 17472 17473 17496 17507 17536 17608 17626 17627 17649 17653\n", + " 17664 17771 17815 17822 17831 17864 17883 17931 17994 17999 18035 18174\n", + " 18209 18250 18274 18307 18327 18403 18423]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 6.5550952e+14 4.9376585e+14 3.8510841e+14 ... 1.6802770e+00\n", + " -1.1248941e+00 -1.1701980e+00]\n", + "FF:[ 6.55509317e+14 4.93765882e+14 3.85108377e+14 ... 1.68027747e+00\n", + " -1.12489426e+00 -1.17019880e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 6 79 111 149 155 168 187 195 220 223 252 261 329 343\n", + " 347 369 386 392 403 438 439 450 461 524 535 643 656 659\n", + " 661 668 722 727 732 742 754 801 816 820 835 837 849 850\n", + " 978 993 997 1012 1019 1034 1044 1071 1088 1094 1114 1135 1151 1170\n", + " 1190 1212 1273 1275 1277 1289 1290 1308 1311 1337 1364 1379 1394 1430\n", + " 1454 1460 1469 1474 1703 1725 1728 1732 1733 1741 1754 1757 1804 1806\n", + " 1856 1862 1932 1945 1996 2030 2044 2045 2065 2071 2075 2094 2149 2152\n", + " 2163 2180 2182 2215 2254 2357 2362 2370 2392 2398 2428 2484 2519 2521\n", + " 2524 2582 2618 2641 2645 2664 2674 2681 2691 2735 2747 2779 2872 2899\n", + " 2909 2935 2957 3000 3033]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_input_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 3.5121140e+01\n", + " -3.5587997e+00 9.5641022e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 3.51211472e+01\n", + " -3.55898285e+00 9.56410980e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_o_proj_in_grad\n", + "HF: [-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... -2.5844165e+02\n", + " 2.0677340e+01 -2.4573349e+01]\n", + "FF:[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... -2.58441467e+02\n", + " 2.06775093e+01 -2.45735531e+01]\n", + "[ True True True ... True True True]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -2.1968115e+02 8.5754425e+01 ... -6.9909119e+01\n", + " -2.6478451e+01 -7.4195160e+01]\n", + " [-3.5698813e+17 3.9582391e+02 5.5431940e+02 ... 1.9529277e+02\n", + " 1.2558211e+02 6.7965935e+01]\n", + " [ 3.4442975e+16 2.8310864e+02 -8.1522171e+01 ... -2.3606525e+01\n", + " -2.0410315e+01 -1.5228156e+02]\n", + " ...\n", + " [ 4.0923264e+16 -2.4507169e+02 -8.2614380e+02 ... -2.6583340e+02\n", + " -1.9878247e+02 -2.5844165e+02]\n", + " [ 6.9156258e+17 1.3969666e+02 -7.5639044e+02 ... -1.5231053e+02\n", + " -3.3650037e+02 2.0677340e+01]\n", + " [ 9.9511712e+16 -3.2348724e+01 3.0624988e+02 ... 1.0391423e+02\n", + " 6.0626881e+01 -2.4573349e+01]]\n", + "FF:[[-1.61869621e+17 -2.19681122e+02 8.57541504e+01 ... -6.99092026e+01\n", + " -2.64783611e+01 -7.41952515e+01]\n", + " [-3.56988336e+17 3.95823853e+02 5.54319275e+02 ... 1.95292725e+02\n", + " 1.25582062e+02 6.79659348e+01]\n", + " [ 3.44430865e+16 2.83108551e+02 -8.15224686e+01 ... -2.36064014e+01\n", + " -2.04101429e+01 -1.52281570e+02]\n", + " ...\n", + " [ 4.09233933e+16 -2.45071564e+02 -8.26143555e+02 ... -2.65833405e+02\n", + " -1.98782272e+02 -2.58441467e+02]\n", + " [ 6.91562577e+17 1.39696579e+02 -7.56390808e+02 ... -1.52310455e+02\n", + " -3.36500092e+02 2.06775093e+01]\n", + " [ 9.95114373e+16 -3.23486938e+01 3.06250122e+02 ... 1.03914482e+02\n", + " 6.06264191e+01 -2.45735531e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... 4.0923264e+16\n", + " 6.9156258e+17 9.9511712e+16]\n", + " [-5.3483575e+02 2.6249797e+03 -6.7268573e+02 ... -6.1204077e+03\n", + " -4.3047915e+03 -9.5139771e+01]\n", + " [-1.2200641e+01 1.0347147e+02 -2.6777636e+01 ... -1.4766699e+02\n", + " -9.8514114e+01 1.2616925e+01]\n", + " ...\n", + " [-3.2097631e+00 9.1431990e+00 -1.6333975e+00 ... -6.9996667e+00\n", + " -6.4008064e+00 1.9126304e+00]\n", + " [-3.0982289e+00 1.2355285e+01 -3.1715555e+00 ... -4.6754313e+00\n", + " -6.2553053e+00 1.0515085e+00]\n", + " [-2.9516125e+00 2.7038031e+00 -6.0580249e+00 ... -1.6555168e+01\n", + " 1.3245420e+00 -1.5741113e+00]]\n", + "FF:[[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... 
4.09233933e+16\n", + " 6.91562577e+17 9.95114373e+16]\n", + " [-5.34834961e+02 2.62497900e+03 -6.72686401e+02 ... -6.12040576e+03\n", + " -4.30479297e+03 -9.51402283e+01]\n", + " [-1.22006664e+01 1.03471611e+02 -2.67777309e+01 ... -1.47666946e+02\n", + " -9.85141525e+01 1.26169167e+01]\n", + " ...\n", + " [-3.20977211e+00 9.14321709e+00 -1.63339353e+00 ... -6.99966621e+00\n", + " -6.40081263e+00 1.91262615e+00]\n", + " [-3.09821057e+00 1.23552399e+01 -3.17152786e+00 ... -4.67541933e+00\n", + " -6.25528765e+00 1.05149710e+00]\n", + " [-2.95161533e+00 2.70380235e+00 -6.05802393e+00 ... -1.65551491e+01\n", + " 1.32455230e+00 -1.57412362e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "8.101851851851851% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.3778828e+16 1.0956941e+03 1.1773144e+02 ... -4.0466427e+01\n", + " -3.1198654e+01 -1.7603550e+01]\n", + " [-1.2087128e+18 6.9384756e+03 6.1327003e+01 ... 1.5329468e+01\n", + " 7.6757736e+00 -4.5589094e+00]\n", + " [-6.7892266e+17 5.4895034e+03 7.6927376e+01 ... 9.1396770e+00\n", + " 2.3195824e+01 -6.1995559e+00]\n", + " ...\n", + " [ 2.6452032e+17 9.9761787e+03 2.2349066e+02 ... 5.7504387e+01\n", + " -8.6791611e-01 4.6890911e+01]\n", + " [-6.7528534e+16 3.3856902e+03 2.5189743e+02 ... 2.2824722e+01\n", + " 8.7917282e+01 -2.1569672e+01]\n", + " [-2.1779064e+17 5.2511855e+03 6.6282043e+01 ... 9.9689598e+00\n", + " -5.5022659e+00 -3.2573143e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.37791458e+16 1.09569678e+03 1.17731285e+02 ... -4.04664154e+01\n", + " -3.11988506e+01 -1.76035423e+01]\n", + " [-1.20871251e+18 6.93847900e+03 6.13275528e+01 ... 1.53295393e+01\n", + " 7.67594433e+00 -4.55900288e+00]\n", + " [-6.78922523e+17 5.48950342e+03 7.69272308e+01 ... 9.13961220e+00\n", + " 2.31957569e+01 -6.19959354e+00]\n", + " ...\n", + " [ 2.64520284e+17 9.97617871e+03 2.23490509e+02 ... 5.75044785e+01\n", + " -8.67943764e-01 4.68908234e+01]\n", + " [-6.75287400e+16 3.38569165e+03 2.51897339e+02 ... 2.28247147e+01\n", + " 8.79171448e+01 -2.15696106e+01]\n", + " [-2.17790679e+17 5.25118652e+03 6.62821960e+01 ... 9.96885872e+00\n", + " -5.50213098e+00 -3.25731125e+01]]\n", + "9.809027777777777% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.7.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_7_layers_7_feed_forward_w2_shard_0_output_0\n", + "HF: [-7.5522525e+19 -1.3283726e+21 -7.2549753e+20 ... 4.9017162e+01\n", + " -9.7436657e+00 8.5870697e+01]\n", + "FF:[-7.55228501e+19 -1.32837218e+21 -7.25497390e+20 ... 4.90171394e+01\n", + " -9.74382782e+00 8.58707886e+01]\n", + "[ True True True ... True False True]\n", + "[ 19 64 75 ... 
18418 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 95\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- W2 --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 95\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m compare_tensors(hf_w2_weight, ff_w2_weight, tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- Lora --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:47\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "attention_tests=True\n", + "for i in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0\"\n", + " 
hf_BWD_w2_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_path}/layers.{i}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_path}/layers.{i}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_path}/layers.{i}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_path}/layers.{i}.self_attn.o_proj.weight\"\n", + " \n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = 
f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_0\"\n", + " ff_BWD_attn_out = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w1_shard_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w3_shard_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_weight_0\"\n", + " \n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if i == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + "\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " if i == tot_num_layers-1:\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " \n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " print(\"-- W2 --\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " \n", + " print(\"-- Lora --\")\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + " \n", + " print(\"-- W2/W1/W3 --\")\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " \n", + " print(\"-- Attention 
--\")\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " if attention_tests:\n", + " # compare attn weight tensors\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.qk_prods_softmax.output_0\"\n", + " ff_attn_qk_prods_softmax = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " # NEED TO VISUALLY INSPECT\n", + " compare_loaded_tensors(hf_attn_heads_grads, ff_attn_heads_grads)\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads)\n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.value_states.output_0\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.qk_prods_softmax.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " \n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", 
+ " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " # print(hf_qk_prods_softmax2[:2,:,0])\n", + " # print(ff_qk_prods_softmax2[:2,:,0])\n", + " assert(pct_mismatch <= 0.1)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.pre_softmax.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " compare_loaded_tensors(hf_qk_prods_softmax2, ff_qk_prods_softmax2)\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = hf_path + f\"/fwd_step_0_layers.11.self_attn.query_activation.output_0\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " # print(hf_query_activation[:,0,:])\n", + " # print()\n", + " # print(ff_query_activation[:,0,:])\n", + " # assert False\n", + " # compare_loaded_tensors(hf_query_activation, ff_query_activation)\n", + " check_rope = False\n", + " if check_rope:\n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], 
device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) 
##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.1)\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.01614726 0.01363804 0.01768043 ... 
0.00724926 -0.00149747\n", + " -0.01781223]\n" + ] + } + ], + "source": [ + "a = np.fromfile(\"/usr0/home/goliaro/.cache/flexflow/weights/goliaro/llama-160m-lora-full/full-precision/layers_11_feed_forward_w2_lora_A_weight\", dtype=np.float32)\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + 
"print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_path}/layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_path}/layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment/opt_alignment_tests.ipynb b/tests/peft/alignment/opt_alignment_tests.ipynb new file mode 100644 index 0000000000..ca679b1857 --- /dev/null +++ b/tests/peft/alignment/opt_alignment_tests.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- LM head ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- Final Norm ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "qProjSize = 64\n", + "num_heads = 12\n", + "num_tokens = 25\n", + "for i in range(tot_num_layers):\n", + " hf_base = os.path.join(hf_path, f\"fwd_step_0_decoder.layers.{i}.\")\n", + " ff_base = os.path.join(ff_path, f\"fwd_step_0_layers_{i}_layers_{i}_\")\n", + " \n", + " # LayerNorm\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.input_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.output_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_1\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "\n", + " # # Attention QKV proj\n", + " # print(\"---Attn---\")\n", + " # ff_tensor = ff_base + \"attention_shard_0_qkv_proj_output\"\n", + " # ff_tensor = load_ff_tensor(ff_tensor, [qProjSize, num_heads, 3, num_tokens])\n", + " # ff_q_proj = ff_tensor[:,:,0,:]\n", + " # ff_k_proj = ff_tensor[:,:,1,:]\n", + " # ff_v_proj = ff_tensor[:,:,2,:]\n", + " # hf_q_proj = hf_base + \"self_attn.q_proj.output_0\"\n", + " # hf_q_proj = load_hf_tensor(hf_q_proj).squeeze().T\n", + " # hf_q_proj = hf_q_proj.reshape(12,64,25)\n", + " # hf_q_proj = np.transpose(hf_q_proj, (1,0,2))\n", + " # hf_k_proj = hf_base + \"self_attn.k_proj.output_0\"\n", + " # hf_k_proj = load_hf_tensor(hf_k_proj).squeeze().T\n", + " # hf_k_proj = hf_k_proj.reshape(12,64,25)\n", + " # hf_k_proj = np.transpose(hf_k_proj, (1,0,2))\n", + " # hf_v_proj = hf_base + \"self_attn.v_proj.output_0\"\n", + " # hf_v_proj = load_hf_tensor(hf_v_proj).squeeze().T\n", + " # hf_v_proj = hf_v_proj.reshape(12,64,25)\n", + " # hf_v_proj = np.transpose(hf_v_proj, (1,0,2))\n", + " # compare_loaded_tensors(hf_q_proj/np.sqrt(qProjSize), ff_q_proj)\n", + " # compare_loaded_tensors(hf_k_proj, ff_k_proj)\n", + " # compare_loaded_tensors(hf_v_proj, ff_v_proj)\n", + "\n", + " # Compare attn bias, residuals\n", + " print(\"--- Attn bias + residual ---\")\n", + " ff_residual1 = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_input_1\"\n", + " ff_residual2 = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_flexflow_tensors(ff_residual1, ff_residual2)\n", + " hf_tensor = hf_base + 
\"self_attn_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_residual2)\n", + " ff_tensor = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_output_0\"\n", + " hf_tensor = hf_base + \"final_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " \n", + " print(\"--- MLP ---\")\n", + " hf_tensor = hf_base + \"fc1.input_0\"\n", + " ff_tensor = ff_base + \"fc1_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"fc2.input_0\"\n", + " ff_tensor = ff_base + \"fc2_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "# LM head\n", + "print(\"\\n--- LM head ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "# Final layer norm\n", + "print(\"\\n--- Final Norm ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "ff_tensor1 = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_input_activation\"\n", + "# compare_flexflow_tensors_shortest(ff_tensor, ff_tensor1)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_1\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_1\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_mean\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_2\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_rstd\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m compare_flexflow_tensors(ff_tensor, ff_tensor1)\n\u001b[1;32m 20\u001b[0m compare_tensors(hf_tensor, ff_tensor) \u001b[38;5;66;03m# fails\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Compare fwd input/output of layernorm\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_decoder.final_layer_norm.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "# Compare backward pass\n", + "hf_tensor = hf_path 
+ \"/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "hf_tensor = hf_path + \"/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "\n", + "hf_tensor1 = hf_path + \"/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "compare_hf_tensors(hf_tensor, hf_tensor1)\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor1, ff_tensor)\n", + "\n", + "hf_tensor = hf_path + \"/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_0\"\n", + "ff_tensor1 = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_1\"\n", + "compare_flexflow_tensors(ff_tensor, ff_tensor1)\n", + "compare_tensors(hf_tensor, ff_tensor) # fails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\n", + "HF: [ 0.0193019 -1.0467215 0.21579844 ... 0.04534929 -0.25642633\n", + " 0.10879952]\n", + "FF:[ 0.01458706 -1.02212262 0.20589906 ... 0.04446212 -0.25625792\n", + " 0.108039 ]\n", + "[ True False True ... True True True]\n", + "[ 1 3 7 ... 19170 19174 19188]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m hf_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m ff_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_fc1_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_fc1_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# LORA input\u001b[39;00m\n\u001b[1;32m 20\u001b[0m hf_lora_A_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:32\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 
27\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.self_attn_layer_norm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_layer_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " \n", + " hf_ffn_norm_in = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.input_0\"\n", + " ff_ffn_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_0\"\n", + " # compare_tensors(hf_ffn_norm_in, ff_ffn_norm_in)\n", + " \n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_1\"\n", + " # compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " hf_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\"\n", + " ff_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\"\n", + " compare_tensors(hf_fc1_in, ff_fc1_in)\n", + "\n", + "\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = 
f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.final_layer_norm.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\n", + "HF: [-0.00542103 -1.781267 0.16552497 ... -0.77217525 -0.5760026\n", + " 0.04363118]\n", + "FF:[ 0.03817766 -1.5644939 0.22477378 ... -0.94569921 -0.43960798\n", + " -0.06447437]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 
19197 19198 19199]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m ff_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 21\u001b[0m ff_FWD_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_FWD_norm_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_FWD_norm_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n\u001b[1;32m 25\u001b[0m hf_BWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_decoder.final_layer_norm.gi_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:29\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, 
tolerance=1e-5)\n", + "\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "\n", + "# Compare fwd input/output of layernorm\n", + "hf_FWD_norm_in = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "hf_FWD_norm_out = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_FWD_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "ff_FWD_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_FWD_norm_in, ff_FWD_norm_in)\n", + "compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n", + "\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py new file mode 100644 index 0000000000..16b46cfa81 --- /dev/null +++ b/tests/peft/hf_finetune.py @@ -0,0 +1,129 @@ +import os, sys, shutil +import torch + +# Reproducibility +import random +import numpy as np + +torch.manual_seed(0) +random.seed(0) +np.random.seed(0) +# torch.use_deterministic_algorithms(True) + +# import bitsandbytes as bnb +import argparse +import transformers + +if transformers.__version__ < "4.31.0": + raise RuntimeError( + "Please update the transformers library version to 4.31.0 or above" + ) +from datasets import load_dataset + + +from hf_utils import * + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, default="goliaro/llama-160m-lora") + parser.add_argument( + "--lora-alpha", + type=int, + default=-1, + help="The scaling coefficient for LoRA. Leave it set to -1 to use the original value from the HF config", + ) + parser.add_argument( + "--lora-dropout", + type=float, + default=0.0, + help="The dropout rate for LoRA. 
Set it to -1 to use the original value from the HF config",
+    )
+    parser.add_argument("-lr", "--learning-rate", type=float, default=0.001)
+    parser.add_argument("-n", "--max-steps", type=int, default=2)
+    parser.add_argument(
+        "--optimizer", type=str, choices=["sgd", "adam", "adamw"], default="sgd"
+    )
+    parser.add_argument(
+        "--use-full-precision", action="store_true", help="Use full precision"
+    )
+    parser.add_argument("--output-dir", type=str, default="")
+    parser.add_argument("--publish-peft-with-id", type=str, default="")
+    parser.add_argument(
+        "--save-peft-tensors",
+        action="store_true",
+        help="Save PEFT hidden states and weights to file",
+    )
+    args = parser.parse_args()
+
+    # Change working dir to folder storing this script
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+
+    # Get PEFT config, model, tokenizer, and optimizer type
+    peft_config = build_peft_config(args, finetuning=True)
+    tokenizer = get_peft_tokenizer(args, peft_config)
+    model = build_peft_model(args, peft_config)
+    optim_type = get_optim_type(args)
+
+    # Print model with PEFT
+    print(model)
+    for name, params in model.named_parameters():
+        print(name)
+    print_trainable_parameters(model)
+
+    # Add hooks to save PEFT tensors, save any weights of interest before finetuning
+    if args.save_peft_tensors:
+        make_debug_dirs()
+        register_peft_hooks(model)
+        save_peft_weights(model, target_modules=["lora", "lm_head", "down_proj"])
+
+    # Load fine-tuning dataset
+    data = load_dataset("Abirate/english_quotes")
+    # TODO: remove the filtering down to a single row (debugging only)
+    key_to_filter = "quote"
+    desired_value = "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”"
+    data = filter_dataset_for_debugging(data, key_to_filter, desired_value)
+    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+    # Training loop
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=data["train"],
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=1,
+            max_grad_norm=None,  # Disable gradient clipping
+            warmup_steps=0,
+            max_steps=args.max_steps,
+            learning_rate=args.learning_rate,
+            fp16=True if not args.use_full_precision else False,
+            logging_steps=1,
+            output_dir=os.path.join(
+                args.output_dir if len(args.output_dir) > 0 else "./",
+                "lora_training_logs",
+            ),
+            optim=optim_type,
+            lr_scheduler_type=transformers.training_args.SchedulerType.CONSTANT,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(
+            tokenizer, mlm=False
+        ),
+        callbacks=[HFTrainingCallBack] if args.save_peft_tensors else None,
+    )
+    # silence the warnings. Please re-enable for inference!
+ model.config.use_cache = False + + # for batch in trainer.get_train_dataloader(): + # print("First batch: ") + # print(batch) + # break + + trainer.train() + + save_finetuned_model(model, args) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py new file mode 100644 index 0000000000..7bfc560cc2 --- /dev/null +++ b/tests/peft/hf_serve.py @@ -0,0 +1,140 @@ +import argparse +import torch +import os, sys, shutil, json +from peft import PeftModel, PeftConfig +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + + +def peft_pre_forward_hook(module, input): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) + print("Pre-Input: ", input[0].shape) + torch.save( + input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) + # print("===") + + +def peft_post_forward_hook(module, input, output): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save( + output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) + print("===") + module.decoding_step += 1 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, required=True) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) + args = parser.parse_args() + + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return + + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + return_dict=True, + # load_in_8bit=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer + hf_config = AutoConfig.from_pretrained( + config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + + # Generation config + generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) + generation_config.do_sample = args.do_sample + + # Register hooks to save tensors, if needed + if args.save_peft_tensors: + # Change 
working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir + shutil.rmtree("./hf_peft_tensors") + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + torch.save(params, f"./hf_peft_tensors/{name}") + # params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # Save hidden states + for name, layer in dict(model.named_modules()).items(): + if "lora_A.default" in name or "lora_B.default" in name: + layer.name = name + layer.decoding_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(peft_pre_forward_hook) + layer.register_forward_hook(peft_post_forward_hook) + + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_train.py b/tests/peft/hf_train.py new file mode 100644 index 0000000000..707fc9d0ae --- /dev/null +++ b/tests/peft/hf_train.py @@ -0,0 +1,161 @@ +import os, sys + +# os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn + +# import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. 
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument( + "--lora-target-modules", + type=str, + default="down_proj", + help="Comma-separated list of layers from the base model to target", + ) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") + args = parser.parse_args() + model_name = args.model_name + use_full_precision = args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") + lora_dropout = args.lora_dropout + output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError( + "Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the trained model" + ) + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + + for param in model.parameters(): + param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + # target_modules=["q_proj", "v_proj"], + # target_modules=["down_proj"], + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data["train"], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False + ), + ) + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + trainer.train() + + if len(output_dir) > 0: + print(f"Done training! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {publish_peft_with_id}..." + ) + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py new file mode 100644 index 0000000000..9332c803b2 --- /dev/null +++ b/tests/peft/hf_utils.py @@ -0,0 +1,352 @@ +import torch +import torch.nn as nn +import transformers +from transformers import ( + TrainerCallback, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + LlamaTokenizer, +) +import os, shutil +from peft import PeftConfig, PeftModel +from datasets import load_dataset, DatasetDict + +debug_dir = None +debug_subdirs = ["fwd", "bwd", "optim", "weights"] +verbose = False + + +def make_debug_dirs(): + global debug_dir + global debug_subdirs + debug_dir = os.environ.get("FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow")) + debug_dir = os.path.join(debug_dir, "debug", "huggingface") + shutil.rmtree(debug_dir, ignore_errors=True) + os.makedirs(debug_dir, exist_ok=True) + assert debug_dir is not None + assert os.path.isdir(debug_dir) + for subdir in debug_subdirs: + subdir_path = os.path.join(debug_dir, subdir) + os.makedirs(subdir_path, exist_ok=False) + + +def get_dst_folder(subdir, step_idx=0): + global debug_dir, debug_subdirs + assert subdir in debug_subdirs + dst_folder = os.path.join(debug_dir, subdir, f"step_{step_idx}") + os.makedirs(dst_folder, exist_ok=True) + return dst_folder + + +def simplify_name(name): + return name.replace("base_model.model.model.", "").replace("base_model.model.", "") + + +def get_optim_type(args): + if args.optimizer == "sgd": + return transformers.training_args.OptimizerNames.SGD + elif args.optimizer == "adam": + return transformers.training_args.OptimizerNames.ADAM + elif args.optimizer == "adamw": + return transformers.training_args.OptimizerNames.ADAMW + else: + raise ValueError(f"Optimizer {args.optimizer} not supported") + + +class 
CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def peft_backward_hook(module, grad_input, grad_output): + assert type(grad_input) == tuple and type(grad_output) == tuple + if len(grad_input) == 0 or len(grad_output) == 0: + return + assert module.name is not None and module.bwd_step is not None + name = simplify_name(module.name) + if verbose: + print( + f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" + ) + print("Backward GRAD Output:") + for i, out_grad in enumerate(grad_output): + if type(out_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_gradient_{i}") + if verbose: + print("\t", out_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(out_grad, dst_filepath) + else: + if verbose: + print(out_grad) + if verbose: + print("Backward GRAD Input:") + for i, in_grad in enumerate(grad_input): + if type(in_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_gradient_{i}") + if verbose: + print("\t", in_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(in_grad, dst_filepath) + else: + if verbose: + print(in_grad) + if verbose: + print("===") + module.bwd_step += 1 + + +def peft_forward_hook(module, input, output): + if len(input) == 0 or len(output) == 0: + return + assert module.name is not None and module.fwd_step is not None + name = simplify_name(module.name) + if verbose: + print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") + print("Input:") + if type(input) == torch.Tensor: + if verbose: + print(input.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_0") + torch.save(input, dst_filepath) + elif type(input) == tuple: + for i, inp in enumerate(input): + if type(inp) == torch.Tensor: + if verbose: + print(inp.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_{i}") + torch.save(inp, dst_filepath) + else: + if verbose: + print(inp) + else: + assert False + if verbose: + print("Output:") + if type(output) == torch.Tensor: + if verbose: + print(output.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_0") + torch.save(output, dst_filepath) + elif type(output) == tuple: + for i, out in enumerate(output): + if type(out) == torch.Tensor: + if verbose: + print(out.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_{i}") + torch.save(out, dst_filepath) + else: + if verbose: + print(out) + else: + assert False + if verbose: + print("===") + module.fwd_step += 1 + + +def peft_optimizer_hook(model_, callback_func_handle): + def post_hook(optimizer, args, kwargs): + if verbose: + print("Optimizer Hook activated") + bwd_step = callback_func_handle.step_count + for name_, module in model_.named_modules(): + name = 
simplify_name(name_) + for param_name, param in module.named_parameters(recurse=False): + if param.requires_grad: + if verbose: + print( + f"Step #{bwd_step}: Saving weight gradient for {name} ({param.grad.shape})" + ) + dst_folder = get_dst_folder("weights", bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.gradient") + torch.save(param.grad, dst_filepath) + + return post_hook + + +class HFTrainingCallBack(TrainerCallback): + def on_train_begin(self, args, state, control, **kwargs): + if verbose: + print("Starting finetuning") + model_ = kwargs.get("model", None) + optim = kwargs.get("optimizer", None) + assert model_ is not None + assert optim is not None + self.step_count = 0 + optim.optimizer.register_step_post_hook(peft_optimizer_hook(model_, self)) + + def save_lora_weights(self, model, pre_finetuning=False): + lora_weights_handles = [ + (simplify_name(name), params) + for name, params in model.named_parameters() + if "lora" in name + ] + for simplified_name, params in lora_weights_handles: + dst_folder = get_dst_folder("weights", self.step_count) + if pre_finetuning: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_original") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving ORIGINAL weight {simplified_name} ({params.shape})" + ) + else: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_finetuned") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving FINETUNED weight {simplified_name} ({params.shape})" + ) + if not pre_finetuning: + self.step_count += 1 + + def on_step_end( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=False) + + def on_step_begin( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=True) + + def on_train_end(self, args, state, control, **kwargs): + if verbose: + print(f"Finetuning ended after {self.step_count} steps") + + +def build_peft_config(args, finetuning=False): + peft_config = PeftConfig.from_pretrained(args.peft_model_id) + if peft_config.peft_type != "LORA": + raise ValueError(f"PEFT type {peft_config.peft_type} not supported yet") + if args.lora_alpha > 0.0: + peft_config.lora_alpha = args.lora_alpha + if peft_config.lora_dropout >= 0.0: + peft_config.lora_dropout = args.lora_dropout + # prevent HF from re-inizialing the weights randomly if finetuning + if finetuning: + peft_config.init_lora_weights = False + return peft_config + + +def prepare_model_for_lora_finetuning(model, save_peft_tensors=False): + # Freeze all layers except the LORA ones. Cast small layers to full precision for stability + for name, param in model.named_parameters(): + if "lora" not in name: + param.requires_grad = False # freeze the model - train adapters later + else: + param.requires_grad = True + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + if not save_peft_tensors: + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + model.lm_head = CastOutputToFloat(model.lm_head) + return model + + +def build_peft_model(args, peft_config): + # Load base model, and apply the PEFT layer + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + model = PeftModel.from_pretrained(model, args.peft_model_id, config=peft_config) + model = prepare_model_for_lora_finetuning(model, args.save_peft_tensors) + return model + + +def get_peft_tokenizer(args, peft_config): + # Get Tokenizer + hf_config = AutoConfig.from_pretrained( + peft_config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + return tokenizer + + +def register_peft_hooks(model): + # Save hidden states and gradients + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + layer.bwd_step = 0 + if verbose: + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(peft_forward_hook) + layer.register_full_backward_hook(peft_backward_hook) + + +def save_peft_weights(model, target_modules=[]): + # Save any weights of interest + for name, params in model.named_parameters(): + simplified_name = simplify_name(name) + for target_module in target_modules: + if target_module in name: + dst_folder = get_dst_folder("weights") + dst_filepath = os.path.join(dst_folder, f"{simplified_name}") + torch.save(params, dst_filepath) + + +def filter_dataset_for_debugging(data, key_to_filter, desired_value): + filtered_dataset_dict = DatasetDict() + for split, dataset in data.items(): + filtered_dataset = dataset.filter( + lambda example: example[key_to_filter] == desired_value + ) + filtered_dataset_dict[split] = filtered_dataset + data = filtered_dataset_dict + return data + + +def save_finetuned_model(model, args): + if len(args.output_dir) > 0: + if verbose: + print(f"Saving the model to {args.output_dir}...") + model.save_pretrained(args.output_dir) + + if len(args.publish_peft_with_id) > 0: + if verbose: + print( + f"Uploading the model to HF hub with id: {args.publish_peft_with_id}..." 
+ ) + model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py new file mode 100644 index 0000000000..266bb64137 --- /dev/null +++ b/tests/peft/peft_alignment_test.py @@ -0,0 +1,730 @@ +import numpy as np +import os, torch, argparse +from alignment.align_test_utils import * +from transformers import AutoConfig +from peft import PeftConfig +from tqdm import tqdm + +class AlignmentTest: + def __init__(self, model_name, tp_degree=1): + raise NotImplementedError() + def check_weights_alignment(self): + raise NotImplementedError() + def check_fwd_pass(self): + raise NotImplementedError() + def check_bwd_pass(self): + raise NotImplementedError() + def check_step(self, step_idx, learning_rate=0.001): + raise NotImplementedError() + +class LllamaAlignmentTest(AlignmentTest): + def __init__(self, model_name, tp_degree=1): + self.model_name = model_name + self.peft_config = PeftConfig.from_pretrained(model_name) + self.hf_config = AutoConfig.from_pretrained(self.peft_config.base_model_name_or_path) + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.intermediate_size + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.num_attention_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + self.lora_scaling_factor = self.peft_config.lora_alpha / self.peft_config.r + + self.num_tokens = None + self.ff_batch_size = None + + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight": + f_version = f"layers.{self.num_layers-1}.lm_head.weight_0" + elif hf_filename == "norm.weight": + f_version = f"layers.{self.num_layers-1}.norm.weight_0" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." 
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "norm" in ff_weight_name: + return 1 + if "gate_proj" in ff_weight_name or "up_proj" in ff_weight_name: + return 1 + elif "down_proj" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weigth_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) + ff_weigth_shape = list(hf_weigth_shape)[::-1] + if ff_partition_dim >= 0: + ff_weigth_shape[ff_partition_dim] //= self.tp_degree + + # 2. handle flexflow shards in case of tensor parallelism + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weigth_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." 
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + 
print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input laye norm + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + if i == 0: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + else: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} output") + + # Attention + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # Post-attention layernorm + hf_tensor_name = f"layers.{i}.post_attention_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Post-attention layernorm {i} output") + + # W1 (gate_proj) + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 
{i} output") + + # W3 (up_proj) + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} output") + + # W2 (down_proj) + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} input") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} input") + torch.testing.assert_close(hf_down_proj_in, hf_tensor, rtol=1.3e-6, atol=1e-5) + + # LoRA intermediate + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="low_rank_activation", hf_tensor_idx=0, ff_tensor_idx=None) + hf_lora_A_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + hf_lora_B_in = get_hf_tensor(hf_tensor_name, input_comparison) + torch.testing.assert_close(hf_lora_A_out, hf_lora_B_in, rtol=1.3e-6, atol=1e-5) + ff_tensor_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora" + ff_lora_A_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_lora_A_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_lora_A_out, ff_lora_A_out, label=f"LoRA_A {i} output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) * self.lora_scaling_factor + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_down_proj_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_down_proj_out, ff_tensor, label=f"W2_out + scaling*LoRA_B_out {i}") + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_down_proj_out, label=f"LoRA_B {i} output") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = 
TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + + def check_bwd_pass(self, step_idx=0): + if not self.num_tokens or not self.ff_batch_size: + raise ValueError("Number of tokens and batch size must be set before running backward pass check") + hf_bwd_folder = os.path.join(hf_path, "bwd", f"step_{step_idx}") + ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + # f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. 
+ f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_bwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE, pre=False, shard_axis=0): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + + ff_tensor_path = os.path.join(ff_bwd_folder, ff_tensor_filename) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[shard_axis] //= self.tp_degree + + # exception: intermediate attention tensors + intermediate_attention_tensor = ( + "self_attn" in ff_tensor_name and + not ( + ff_tensor_name.endswith(".self_attn") and + ( + tensor_comparison_idx.ff_tensor_type == "output_gradient" or + tensor_comparison_idx.ff_tensor_type == "input_gradient" + ) + ) + ) + if not intermediate_attention_tensor: + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=shard_axis) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=shard_axis) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + if not intermediate_attention_tensor: + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-3): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .06 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- BWD pass {step_idx}--") + 
+ # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label="LM head gradient input") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="Norm gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm gradient input") + + # Transformers blocks + for i in range(self.num_layers-1, -1, -1): + # W2 (down_proj) output + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor + compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + + # W2 (down_proj) input + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", 
hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # W2 input (HF) and SigmoidSiluMulti output (FF) + hf_w2_input = hf_tensor.clone() + ff_tensor_name = f"layers.{i}.SigmoidSiluMulti" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_w2_input, ff_tensor, label=f"HF W2 {i} output and FF SSM output") + + # W1 (gate_proj) output + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 {i} gradient output") + # W1 (gate_proj) input + # HF W1 in = FF W1 in - HF W1 in (pre) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + ff_tensor_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE, pre=True) + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_tensor_pre, label=f"W1 {i} gradient input") + + # W3 (up_proj) output + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # W3 (up_proj) input + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") + + # Attn O-proj + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = 
get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient input") + + # V-proj grads + # FF shape: [num_tokens, qProjSize*num_heads] + hf_tensor_name = f"layers.{i}.self_attn.v_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + mixed_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, mixed_comparison) + hf_tensor = hf_tensor.squeeze().T + ff_tensor = get_ff_tensor(ff_tensor_name, mixed_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=1) + compare(hf_tensor, ff_tensor, label=f"V-proj {i} gradient input") + + # K-proj grads + # FF shape: (num_tokens, qProjSize, num_heads) + hf_tensor_name = f"layers.{i}.self_attn.k_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + k_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="devkproj", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, k_proj_comparison) + hf_tensor = hf_tensor.squeeze().view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous() + hf_tensor = hf_tensor.T + ff_tensor = get_ff_tensor(ff_tensor_name, k_proj_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=2) + compare(hf_tensor, ff_tensor, label=f"K-proj {i} gradient input") + + # Q-proj grads + # FF shape (devQKVPRojArray): (num_tokens, qProjSize, num_heads, 3) + # Q-proj out grad: devQKVPRojArray[:,:,:,0] + hf_tensor_name = f"layers.{i}.self_attn.q_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.devQKVPRojArray" + q_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, q_proj_comparison) + hf_tensor = hf_tensor.view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous().T + augmented_hf_tensor_shape = torch.Size([3]+list(hf_tensor.size())) + ff_tensor = get_ff_tensor(ff_tensor_name, q_proj_comparison, augmented_hf_tensor_shape, tp_type=TPType.PARTITION, shard_axis=2)[:,:,:,0] + compare(hf_tensor, ff_tensor, label=f"Q-proj {i} gradient input") + + # FF Attn input with HF layernorm out + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attn input {i} gradient input") + + if i > 0: + # FF attn input with FF layernorm out 1 + attn_input = ff_tensor.clone() + ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" + _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + + # Input layernorm + + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = 
TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_in1_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm0 = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + input_layernorm1 = get_ff_tensor(ff_tensor_name, ff_in1_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(input_layernorm0, input_layernorm1, rtol=1.3e-6, atol=1e-5) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + # if i > 1: + # compare(hf_tensor, input_layernorm1, label=f"Input layernorm {i} gradient input") + + def check_step(self, step_idx=0, learning_rate=0.001): + hf_weight_folder = os.path.join(hf_path, "weights", f"step_{step_idx}") + ff_weight_folder = os.path.join(ff_path, "weights", f"step_{step_idx}", "shard_0") + def convert_hf_filename_to_ff(hf_filename): + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora.weight_A").replace("lora_B", "lora.weight_B") + return f_version + def get_hf_tensor(hf_tensor_name): + hf_tensor_path = os.path.join(hf_weight_folder, hf_tensor_name) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + def get_ff_tensor(ff_tensor_name, hf_shape, tp_type=TPType.REPLICATE, pre=False): + ff_tensor_path = os.path.join(ff_weight_folder, ff_tensor_name) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + return ff_tensor + def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + 
print(ff_tensor.squeeze()) + raise e + print(f"-- optimizer pass {step_idx}--") + + for i in range(self.num_layers-1, -1, -1): + # LoRA_B gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_B.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" + # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + # ff_out_gradient = load_ff_tensor(os.path.join(ff_bwd_folder, ff_out_gradient_name), [self.hidden_size, 128])[:,:self.num_tokens] + # ff_out_gradient = torch.from_numpy(ff_out_gradient) + # print("Output gradient shape: ", ff_out_gradient.shape) + # ff_low_rank_activation = f"layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + # ff_low_rank_activation = load_ff_tensor(os.path.join(ff_fwd_folder, ff_low_rank_activation), [16, 128])[:,:self.num_tokens] + # ff_low_rank_activation = torch.from_numpy(ff_low_rank_activation) + # print("Low rank activation shape: ", ff_low_rank_activation.shape) + # simulated_weight_grad = ff_low_rank_activation @ ff_out_gradient.T + # print("Simulated weight grad shape: ", simulated_weight_grad.shape) + # print(simulated_weight_grad) + # print(ff_gradient) + # compare(hf_gradient, simulated_weight_grad, label=f"LoRA_B {i} simulated gradient") + + + # LoRA_A gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_A.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.PARTITION) + compare(hf_gradient, ff_gradient, label=f"LoRA_A {i} gradient") + +parser = argparse.ArgumentParser(description='Argument Parser Example') +# Adding arguments +parser.add_argument('-m', '--model-name', type=str, default="goliaro/llama-160m-lora", help='Name of the model') +parser.add_argument('-n', '--num-steps', type=int, default=1, help='Number of finetuning steps') +parser.add_argument('-tp', '--tensor-parallelism-degree', type=int, default=1, help='The tensor parallelism degree used when running FlexFlow') +parser.add_argument('-lr', '--learning-rate', type=float, default=0.001, help='The learning rate used at finetuning time') + +# Parse the arguments from command line +args = 
parser.parse_args()
+
+if __name__ == "__main__":
+    llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree)
+    # llama_alignment.check_weights_alignment()
+    for i in range(args.num_steps):
+        llama_alignment.check_fwd_pass(i)
+        llama_alignment.check_bwd_pass(i)
+        llama_alignment.check_step(i, args.learning_rate)
diff --git a/tests/peft_test.sh b/tests/peft_test.sh
new file mode 100755
index 0000000000..5600d57edf
--- /dev/null
+++ b/tests/peft_test.sh
@@ -0,0 +1,66 @@
+#! /usr/bin/env bash
+# set -x
+set -e
+
+cleanup() {
+    rm -rf ~/.cache/flexflow/debug
+}
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}/.."
+
+# Token to access private huggingface models (e.g. LLAMA-2)
+HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none}
+if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
+fi
+
+# Clean up before test (just in case)
+cleanup
+
+# Create test prompt file
+mkdir -p ./inference/prompt
+echo '["Two things are infinite: "]' > ./inference/prompt/peft.json
+echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.”"]' > ./inference/prompt/peft_dataset.json
+
+
+# Create output folder
+mkdir -p ./inference/output
+
+# Enable backtrace in case we run into a segfault or assertion failure
+export LEGION_BACKTRACE=1
+
+# Download test model
+python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m
+
+# Run PEFT in Huggingface to get ground truth tensors
+python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision
+
+# Python test
+echo "Python test"
+python ./inference/python/ff_peft.py
+# Check alignment
+python ./tests/peft/peft_alignment_test.py -tp 2
+
+# C++ test
+echo "C++ test"
+./build/inference/peft/peft \
+    -ll:gpu 2 -ll:cpu 4 -ll:util 4 \
+    -tensor-parallelism-degree 2 \
+    -ll:fsize 8192 -ll:zsize 12000 \
+    -llm-model JackFram/llama-160m \
+    -finetuning-dataset ./inference/prompt/peft_dataset.json \
+    -peft-model goliaro/llama-160m-lora \
+    -enable-peft \
+    --use-full-precision \
+    --inference-debugging
+# Check alignment
+python ./tests/peft/peft_alignment_test.py -tp 2
+
+# Print success message
+echo ""
+echo "PEFT tests passed!"
+echo "" + +# Cleanup after the test +cleanup From 0ba7c9f1a90fa4ae2b800fd852194e7b7d15dca8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 21 Sep 2024 12:41:04 -0700 Subject: [PATCH 21/44] Update nccl (#1507) * update nccl * fix * update --------- Co-authored-by: Ubuntu --- cmake/nccl.cmake | 200 +++++++++++++++------------------------ deps/nccl | 2 +- docker/run.sh | 12 +-- tests/inference_tests.sh | 3 - 4 files changed, 81 insertions(+), 136 deletions(-) diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index c140a44ec8..82cf3b4122 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -2,140 +2,88 @@ set(NCCL_NAME nccl) # set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}") # message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}") -set(NCCL_URL "") -if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - if(LINUX_VERSION MATCHES "20.04") - if (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz") - endif() - elseif(LINUX_VERSION MATCHES "18.04") - if (CUDA_VERSION VERSION_EQUAL "10.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "10.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - 
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz") - endif() - endif() +if(NCCL_PATH) + set(NCCL_ROOT ${NCCL_PATH}) +else() + # if NCCL_PATH is not set, let's try to find it in the CUDA root + set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) endif() -if(NCCL_URL) - # Download and import pre-compiled NCCL library - message(STATUS "Using pre-compiled NCCL library") - message(STATUS "NCCL_URL: ${NCCL_URL}") +find_library(NCCL_LIBRARY + NAMES libnccl${LIBEXT} + PATHS ${NCCL_ROOT} ${CUDA_ROOT} + PATH_SUFFIXES lib lib64 + DOC "NCCL library." ) - include(FetchContent) - FetchContent_Declare(${NCCL_NAME} - URL ${NCCL_URL} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - ) - FetchContent_GetProperties(${NCCL_NAME}) - if(NOT ${NCCL_NAME}_POPULATED) - FetchContent_Populate(${NCCL_NAME}) - endif() - - set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME}) - set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include) - set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib) - message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}") - add_library(nccl SHARED IMPORTED) - set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH}) +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_ROOT} + PATH_SUFFIXES include + DOC "NCCL include directory.") - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT}) - install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include) - install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) - -else() - if(NCCL_PATH) - set(NCCL_ROOT ${NCCL_PATH}) +# find NCCL, set NCCL lib and include +if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) + set(NCCL_FOUND ON) + set(NCCL_LIBRARIES ${NCCL_LIBRARY}) + set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) + + # Check NCCL version + if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h") + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES + REGEX "#define NCCL_MAJOR [0-9]+" ) + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2 + REGEX "#define NCCL_MINOR [0-9]+" ) + string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES}) + string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2}) + set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}") + if(NCCL_VERSION VERSION_LESS 2.23) + set(NCCL_OLD TRUE) + else() + set(NCCL_OLD FALSE) + endif() + message(STATUS "Found NCCL version: ${NCCL_VERSION}") else() - # if NCCL_PATH is not set, let's try to find it in the CUDA root - set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + message(WARNING "NCCL header not found, unable to determine version") + set(NCCL_OLD TRUE) # Assume old version if we can't determine endif() - - find_library(NCCL_LIBRARY - NAMES libnccl${LIBEXT} - PATHS ${NCCL_ROOT} ${CUDA_ROOT} - PATH_SUFFIXES lib lib64 - DOC "NCCL library." 
) +endif() - find_path(NCCL_INCLUDE_DIR - NAMES nccl.h - HINTS ${NCCL_ROOT} - PATH_SUFFIXES include - DOC "NCCL include directory.") - - # find NCCL, set NCCL lib and include - if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) - set(NCCL_FOUND ON) - set(NCCL_LIBRARIES ${NCCL_LIBRARY}) - set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - endif() - - # find NCCL - if(NCCL_FOUND) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) - message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) - message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) - add_library(nccl SHARED IMPORTED) - - # Build NCCL from source - else() - message(STATUS "Building NCCL from source") - list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - - ExternalProject_Add(${NCCL_NAME} - SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} - PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} - INSTALL_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}" - BUILD_IN_SOURCE 1 - ) +# find NCCL +if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0)) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) + message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) + message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) + add_library(nccl SHARED IMPORTED) + +# Build NCCL from source +else() + message(STATUS "Building NCCL from source") + list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) - message(STATUS "NCCL install dir: ${INSTALL_DIR}") - list(APPEND FLEXFLOW_INCLUDE_DIRS - ${INSTALL_DIR}/include) - list(APPEND FLEXFLOW_EXT_LIBRARIES - ${INSTALL_DIR}/lib/libnccl${LIBEXT}) - set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") - - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) + set(NCCL_BUILD_CMD make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}") + if(DEFINED ENV{MAKEFLAGS}) + set(NCCL_BUILD_CMD ${CMAKE_COMMAND} -E env MAKEFLAGS=$ENV{MAKEFLAGS} ${NCCL_BUILD_CMD}) endif() + ExternalProject_Add(${NCCL_NAME} + SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} + PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} + INSTALL_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${NCCL_BUILD_CMD} + BUILD_IN_SOURCE 1 + ) + ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) + message(STATUS "NCCL install dir: ${INSTALL_DIR}") + list(APPEND FLEXFLOW_INCLUDE_DIRS + ${INSTALL_DIR}/include) + list(APPEND FLEXFLOW_EXT_LIBRARIES + ${INSTALL_DIR}/lib/libnccl${LIBEXT}) + set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") + + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" 
EXCLUDE) endif() diff --git a/deps/nccl b/deps/nccl index 6e24ef4e1f..2ea4ee94bf 160000 --- a/deps/nccl +++ b/deps/nccl @@ -1 +1 @@ -Subproject commit 6e24ef4e1f1eac9f104d115ef65429f179924ee7 +Subproject commit 2ea4ee94bfb04c886c79ccae60ac9961000fdee2 diff --git a/docker/run.sh b/docker/run.sh index cf105a10c8..cdf9383052 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -18,8 +18,6 @@ ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi -# Whether to attach inference weights / files (make sure to download the weights first) -ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false} # Amount of shared memory to give the Docker container access to # If you get a Bus Error, increase this value. If you don't have enough memory @@ -115,9 +113,11 @@ if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":lat exit 1 fi -inference_volumes="" -if $ATTACH_INFERENCE_FILES ; then - inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference"; +hf_token_volume="" +hf_token_path="$HOME/.cache/huggingface/token" +if [ -f "$hf_token_path" ]; then + # If the token exists, add the volume mount to the Docker command + hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 895b74c798..d173cce06d 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -25,9 +25,6 @@ fi # Clean up before test (just in case) cleanup -# Make sure supported version of protobuf is installed -pip3 install protobuf==3.20.3 - # Create test prompt file mkdir -p ../inference/prompt echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json From 1f6350faaa06d9aa9c3a2ed963355fe4fe7876c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 22 Sep 2024 18:21:22 -0400 Subject: [PATCH 22/44] speedup docker builds --- docker/flexflow-environment/Dockerfile | 38 ++++++++++++++++++-------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 3434916d6b..ee13a07375 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -42,17 +42,38 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya -# Optionally install HIP dependencies +# set MAKEFLAGS to speedup any dependency that uses make +ARG N_BUILD_CORES +ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" + +# Set env vars +ENV PATH /opt/conda/bin:$PATH +ENV CUDNN_DIR /usr/local/cuda +ENV CUDA_DIR /usr/local/cuda + +# GPU-specific dependencies +ARG FF_GPU_BACKEND "cuda" + +# Update NCCL if FF_GPU_BACKEND is cuda +RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Updating NCCL"; \ + ubuntu_version=$(lsb_release -rs); \ + ubuntu_version=${ubuntu_version//./}; \ + wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ + DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ + rm -f cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ + else \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ + fi' + +# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This # package attempts to re-install cuda even though cuda is already installed # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` -ARG FF_GPU_BACKEND "cuda" ARG hip_version "5.6" -ARG N_BUILD_CORES -# set MAKEFLAGS to speedup any dependency that uses make -ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" - RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \ # Check that hip_version is one of 5.3,5.4,5.5,5.6 @@ -83,11 +104,6 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] fi RUN rm -rf /var/lib/apt/lists/* -# Set env vars -ENV PATH /opt/conda/bin:$PATH -ENV CUDNN_DIR /usr/local/cuda -ENV CUDA_DIR /usr/local/cuda - # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies From 2e363c4955f2f80e965db4e2837b709597e83fe8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 22 Sep 2024 18:23:00 -0400 Subject: [PATCH 23/44] update --- docker/flexflow/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index 60f9d4d653..dff9259657 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -27,9 +27,7 @@ RUN for pair in $BUILD_CONFIGS; do \ # Build and install C++ and Python versions of FlexFlow RUN mkdir -p build && cd build && \ eval "$BUILD_CONFIGS" ../config/config.linux && \ - make -j $N_BUILD_CORES && \ - eval "$BUILD_CONFIGS" ../config/config.linux && \ - make install && \ + make -j $N_BUILD_CORES install && \ ldconfig ENTRYPOINT ["/bin/bash"] From 70e47b286370d2ff5feeb7949311881b987c0ac8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 19:07:54 +0000 Subject: [PATCH 24/44] remove outdated code --- src/ops/residual_layer_norm.cpp | 21 ++++++++++++--------- src/ops/residual_layer_norm.cu | 21 ++++++++++++--------- src/runtime/cuda_helper.cu | 16 ++++++++++++++++ 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 582e0752ef..ed973b4f71 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -176,6 +176,8 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } + +#ifdef DEADCODE template void save_inference_tensors(ResidualLayerNormMeta const *m) { if (m->inference_debugging) { @@ -206,6 +208,7 @@ void save_inference_tensors(ResidualLayerNormMeta const *m) { filename3.c_str()); } } +#endif 
/*static*/ void ResidualLayerNorm::inference_kernel_wrapper( @@ -314,15 +317,15 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } - if (m->inference_debugging) { - if (m->input_type[0] == DT_FLOAT) { - save_inference_tensors(m); - } else if (m->input_type[0] == DT_HALF) { - save_inference_tensors(m); - } else { - assert(false && "unsupport datatype in layernorm"); - } - } + // if (m->inference_debugging) { + // if (m->input_type[0] == DT_FLOAT) { + // save_inference_tensors(m); + // } else if (m->input_type[0] == DT_HALF) { + // save_inference_tensors(m); + // } else { + // assert(false && "unsupport datatype in layernorm"); + // } + // } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 8cdf87a92c..50c81d2099 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -174,6 +174,8 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } + +#ifdef DEADCODE template void save_inference_tensors(ResidualLayerNormMeta const *m) { if (m->inference_debugging) { @@ -204,6 +206,7 @@ void save_inference_tensors(ResidualLayerNormMeta const *m) { filename3.c_str()); } } +#endif /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( @@ -312,15 +315,15 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } - if (m->inference_debugging) { - if (m->input_type[0] == DT_FLOAT) { - save_inference_tensors(m); - } else if (m->input_type[0] == DT_HALF) { - save_inference_tensors(m); - } else { - assert(false && "unsupport datatype in layernorm"); - } - } + // if (m->inference_debugging) { + // if (m->input_type[0] == DT_FLOAT) { + // save_inference_tensors(m); + // } else if (m->input_type[0] == DT_HALF) { + // save_inference_tensors(m); + // } else { + // assert(false && "unsupport datatype in layernorm"); + // } + // } if (m->profiling) { cudaEventRecord(t_end, stream); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 386a0c940b..42b3946f8c 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -278,6 +278,10 @@ __host__ void host_ptr, ptr, sizeof(float) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file %s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { @@ -299,6 +303,10 @@ __host__ void host_ptr, ptr, sizeof(half) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file %s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { @@ -321,6 +329,10 @@ __host__ void save_tensor(int32_t const *ptr, host_ptr, ptr, sizeof(int32_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file %s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { @@ -343,6 +355,10 @@ __host__ void save_tensor(int64_t const *ptr, host_ptr, ptr, sizeof(int64_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file 
%s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { From 9da554607063c3b17211238b3cc0e589d2cc50d9 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 25 Sep 2024 17:30:32 -0700 Subject: [PATCH 25/44] [Bug Fix] Update register interface (#1509) * minor bug fix * assign static variant ID to avoid mismatch between ranks --- src/runtime/model.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f46630db3c..ceb9277b76 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -7443,12 +7443,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "Adam Parameter Server Update Task"); + registrar, "Adam Parameter Server Update Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar, 111 /*variant ID*/); } } #ifdef FF_USE_NCCL @@ -7459,12 +7460,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "SGD NCCL Update Task"); + registrar, "SGD NCCL Update Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar, 111 /*variant ID*/); } } { @@ -7473,13 +7475,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "Adam NCCL Update Task"); + registrar, "Adam NCCL Update Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } runtime->register_task_variant( - registrar); + registrar, 111 /*variant ID*/); } } #endif @@ -7610,13 +7612,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "NCCL Init Communicators Task"); + registrar, "NCCL Init Communicators Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } runtime->register_task_variant( - registrar); + registrar, 111 /*variant ID*/); } } { @@ -7626,12 +7628,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "NCCL Finish Communicators Task"); + registrar, "NCCL Finish Communicators Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar, 111 /*variant ID*/); } } #endif From 64c258f3b43e19025889d728799d2bdedde9f732 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 27 Sep 2024 11:59:38 -0700 Subject: [PATCH 26/44] [FusedOp] Fix segment fault (#1511) * minor bug fix * fix --- src/ops/fused.cu | 69 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index cab28181da..8f1212beb4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1678,77 +1678,77 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs 
+ fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + std::vector input_accessor; + std::vector input_grad_accessor; + std::vector weight_accessor; + std::vector weight_grad_accessor; + std::vector output_accessor; + std::vector output_grad_accessor; int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = + input_accessor.push_back( helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], task->regions[i], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = + weight_accessor.push_back( helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = + output_accessor.push_back( helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = + input_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(input_grad_accessor[i].domain == input_accessor[i].domain); } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = + weight_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(weight_grad_accessor[i].domain.get_volume() == weight_accessor[i].domain.get_volume()); } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = + output_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(output_grad_accessor[i].domain == output_accessor[i].domain); } roff += fused->numOutputs; @@ -1767,12 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; // Do backpropagation in the reverse ordering for (int op = 0; op < fused->numOperators; op++) { ioff += fused->op_num_inputs[op]; @@ -1781,18 +1775,24 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { + std::vector my_input_accessor; + std::vector my_weight_accessor; + std::vector my_output_accessor; + std::vector 
my_input_grad_accessor; + std::vector my_weight_grad_accessor; + std::vector my_output_grad_accessor; ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; + my_input_accessor.push_back(input_accessor[my_off]); + my_input_grad_accessor.push_back(input_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; + my_input_accessor.push_back(output_accessor[my_off]); + my_input_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { assert(false); @@ -1800,17 +1800,18 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; + my_weight_accessor.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); + my_weight_grad_accessor.push_back( + weight_grad_accessor[fused->op_weight_idx[i + woff]]); assert(my_weight_grad_accessor[i].domain.get_volume() == my_weight_accessor[i].domain.get_volume()); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - my_output_accessor[i] = output_accessor[my_off]; - my_output_grad_accessor[i] = output_grad_accessor[my_off]; + my_output_accessor.push_back(output_accessor[my_off]); + my_output_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { @@ -1880,7 +1881,7 @@ __host__ void FusedOp::backward_task(Task const *task, int num_inputs = fused->op_num_inputs[op]; Kernels::Concat::backward_kernel_wrapper(m, my_output_grad_accessor[0], - my_input_grad_accessor, + my_input_grad_accessor.data(), num_inputs, m->legion_axis); break; From c78cf04d348aa242c891c783e880e90806c88344 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 20:03:18 -0700 Subject: [PATCH 27/44] enable disabling inference (#1516) --- .github/workflows/build.yml | 12 ++-- .github/workflows/gpu-ci.yml | 6 +- CMakeLists.txt | 105 ++++++++++++----------------------- config/config.inc | 20 +++---- config/config.linux | 6 +- spack/package.py | 4 +- src/c/flexflow_c.cc | 12 ++++ src/ops/beam_topk.cu | 2 +- src/runtime/model.cc | 4 ++ 9 files changed, 77 insertions(+), 94 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ef5961bc87..63e0b9037a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -79,13 +79,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export 
FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi @@ -106,13 +106,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 00ca2df603..6ca50027d1 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -175,7 +175,7 @@ jobs: export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON mkdir build cd build ../config/config.linux @@ -262,8 +262,8 @@ jobs: run: | export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion pip install . --verbose diff --git a/CMakeLists.txt b/CMakeLists.txt index f06969ae04..4e24e1e54b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,14 @@ include(legion) # Not build FlexFlow if BUILD_LEGION_ONLY is ON if(NOT BUILD_LEGION_ONLY) + + # build binary options + option(FF_BUILD_INFERENCE "build all inference code and examples." ON) + option(FF_BUILD_TRAINING_EXAMPLES "build all training examples." OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + # NCCL if(FF_USE_NCCL) if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") @@ -271,18 +279,23 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_HDR LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) - - #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) - list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) - set(FLEXFLOW_CPP_DRV_SRC - ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) + # exclude inference files if FF_BUILD_INFERENCE is off + if(NOT FF_BUILD_INFERENCE) + list(REMOVE_ITEM FLEXFLOW_HDR "${FLEXFLOW_ROOT}/include/request_manager.h") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/inference_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/beam_search_batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/tree_verify_batch_config.cc") + endif() + + set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) add_library(substitution_loader SHARED ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) @@ -297,6 +310,10 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_GPU_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cu) + + if(NOT FF_BUILD_INFERENCE) + 
list(REMOVE_ITEM FLEXFLOW_GPU_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cu") + endif() add_compile_definitions(FF_USE_CUDA) @@ -452,27 +469,6 @@ if(NOT BUILD_LEGION_ONLY) set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - # build binary - option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" OFF) - option(FF_BUILD_RESNET "build resnet example" OFF) - option(FF_BUILD_RESNEXT "build resnext example" OFF) - option(FF_BUILD_ALEXNET "build alexnet example" OFF) - option(FF_BUILD_DLRM "build DLRM example" OFF) - option(FF_BUILD_XDL "build XDL example" OFF) - option(FF_BUILD_INCEPTION "build inception example" OFF) - option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) - option(FF_BUILD_TRANSFORMER "build transformer example" OFF) - option(FF_BUILD_MOE "build mixture of experts example" OFF) - option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) - option(FF_BUILD_SPLIT_TEST "build split test example" OFF) - option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) - option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) - option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) - option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) - option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) - option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) - option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - if(FF_BUILD_UNIT_TESTS) set(BUILD_GMOCK OFF) add_subdirectory(deps/googletest) @@ -488,89 +484,60 @@ if(NOT BUILD_LEGION_ONLY) add_subdirectory(tools/substitutions_to_dot) endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if(FF_BUILD_INFERENCE) + add_compile_definitions(FF_BUILD_INFERENCE) # Ensure Rust is installed execute_process(COMMAND rustc --version RESULT_VARIABLE RUST_COMMAND_RESULT OUTPUT_VARIABLE RUSTC_OUTPUT ERROR_QUIET) if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is not installed on the system. Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() # Ensure Cargo is installed execute_process(COMMAND cargo --version RESULT_VARIABLE CARGO_RESULT OUTPUT_QUIET ERROR_QUIET) if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is installed, but cargo is not. 
Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) endif() - if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + + if (FF_BUILD_TRAINING_EXAMPLES) add_subdirectory(examples/cpp/ResNet) - endif() - - if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/resnext50) - endif() - - if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/AlexNet) - endif() - - if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) - endif() - - if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) - endif() - - if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test_2) - endif() - - if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/InceptionV3) - endif() - - #TODO: Once functional add to BUILD_ALL_EXAMPLES - if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/candle_uno) - endif() - - if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - endif() - - if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/XDL) - endif() - - if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/Transformer) - endif() - - if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + if(FF_BUILD_INFERENCE) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) add_subdirectory(inference/peft) endif() - # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/config/config.inc b/config/config.inc index 6431eaf136..011fe890fb 100644 --- a/config/config.inc +++ b/config/config.inc @@ -128,19 +128,19 @@ elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then fi # build C++ examples -if [ "$FF_BUILD_ALL_EXAMPLES" = "ON" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=OFF" +if [ "$FF_BUILD_TRAINING_EXAMPLES" = "ON" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" +elif [ "$FF_BUILD_TRAINING_EXAMPLES" = "OFF" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=OFF" else - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" fi -if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" +if [ "$FF_BUILD_INFERENCE" = "ON" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" +elif [ "$FF_BUILD_INFERENCE" = "OFF" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=OFF" else - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" + 
SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" fi # enable C++ unit tests diff --git a/config/config.linux b/config/config.linux index acffc210f5..09976cfa03 100755 --- a/config/config.linux +++ b/config/config.linux @@ -65,8 +65,8 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} UCX_DIR=${UCX_DIR:-""} # build C++ examples -FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} +FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES:-OFF} +FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} @@ -108,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES} FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/spack/package.py b/spack/package.py index 273cb30951..12ff294e94 100644 --- a/spack/package.py +++ b/spack/package.py @@ -91,9 +91,9 @@ def cmake_args(self): options.append('-DFF_USE_NCCL=OFF') if '+examples' in spec: - options.append('-DFF_BUILD_ALL_EXAMPLES=ON') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=ON') else: - options.append('-DFF_BUILD_ALL_EXAMPLES=OFF') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=OFF') if '+avx2' in spec: options.append('-DFF_USE_AVX2=ON') diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..532dd00198 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -16,7 +16,9 @@ #include "flexflow/flexflow_c.h" #include "flexflow/dataloader.h" #include "flexflow/mapper.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/utils/file_loader.h" using namespace Legion; @@ -58,6 +60,7 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_dlrm_config_t, DLRMConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); // inference +#ifdef FF_BUILD_INFERENCE 
FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, TreeVerifyBatchConfig *); @@ -74,6 +77,7 @@ class FFCObjectWrapper { // LoraAdamOptimizerConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); +#endif }; Logger ffc_log("flexflow_c"); @@ -1549,6 +1553,7 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +#ifdef FF_BUILD_INFERENCE flexflow_peft_model_id_t flexflow_model_add_lora_layer( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { @@ -1563,6 +1568,7 @@ flexflow_peft_model_id_t flexflow_model_add_lora_layer( peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } +#endif void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { @@ -1617,6 +1623,7 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } +#ifdef FF_BUILD_INFERENCE void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, @@ -1697,6 +1704,7 @@ void flexflow_model_generate(flexflow_model_t handle_, } } } +#endif void flexflow_model_set_position_offset(flexflow_model_t handle_, int const offset) { @@ -2584,6 +2592,8 @@ void flexflow_perform_registration(void) { true /*global*/); } +#ifdef FF_BUILD_INFERENCE + // ----------------------------------------------------------------------- // BatchConfig // ----------------------------------------------------------------------- @@ -3052,3 +3062,5 @@ void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); delete peft_model_id; } + +#endif diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index bf4c23cad0..a7aee338e4 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -15,7 +15,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/beam_topk.h" -#include "flexflow/request_manager.h" +// #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ceb9277b76..5213633e73 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -71,7 +71,9 @@ #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/substitution.h" #include "flexflow/utils/random_utils.h" #include "flexflow/utils/test_utils.h" @@ -4684,6 +4686,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#ifdef FF_BUILD_INFERENCE // RequestManager load_tokens { TaskVariantRegistrar registrar(RM_LOAD_TOKENS_TASK_ID, @@ -4837,6 +4840,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#endif // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, From ca3dabf7d23cf2173fca830249c4cb9eeb6171bf Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 5 Oct 2024 11:36:34 -0700 Subject: [PATCH 28/44] [AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) * minor bug fix * make AllReduce tasks concurrent * set concurrent=true for remaining operators --------- Co-authored-by: Gabriele Oliaro --- src/ops/fused.cc | 6 ++++++ src/ops/lora_linear.cc | 2 ++ 
src/parallel_ops/allreduce.cc | 5 +++++ src/parallel_ops/parallel_identity.cc | 4 ++++ src/runtime/model.cc | 23 +++++++++++++++++++++++ 5 files changed, 40 insertions(+) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 121139beb1..720d678a4a 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -476,6 +476,7 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -570,6 +571,7 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -604,6 +606,7 @@ void FusedOp::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); @@ -659,6 +662,7 @@ FutureMap FusedOp::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -735,6 +739,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -787,6 +792,7 @@ void FusedOp::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int idx = 0; for (int i = 0; i < numInputs; i++) { launcher.add_region_requirement(RegionRequirement(inputs[i]->part, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fde6bc2b28..513147f3b7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -296,6 +296,7 @@ void LoraLinear::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -795,6 +796,7 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 52c4ec2e28..dc43d80133 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -131,6 +131,7 @@ void AllReduce::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -164,6 +165,7 @@ void AllReduce::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -212,6 +214,7 @@ void AllReduce::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -265,6 +268,7 @@ void AllReduce::init_inference(FFModel const &ff, false /*must*/, 0 
/*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -306,6 +310,7 @@ FutureMap AllReduce::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index 883910ae09..7d68036709 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -133,6 +133,7 @@ void ParallelIdentity::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -214,6 +215,7 @@ void ParallelIdentity::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -268,6 +270,7 @@ void ParallelIdentity::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -381,6 +384,7 @@ FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5213633e73..52f1dd2220 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6888,6 +6888,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear Init Task"); @@ -6919,6 +6920,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "LoraLinear PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear PEFT Backward Task"); @@ -6950,6 +6952,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Init Task"); @@ -6964,6 +6967,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6979,6 +6983,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp PEFT Backward Task"); @@ -6994,6 
+6999,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -7008,6 +7014,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -7244,6 +7251,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce init Task"); @@ -7258,6 +7266,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -7272,6 +7283,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -7287,6 +7301,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -7302,6 +7319,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce PEFT Backward Task"); @@ -7318,6 +7338,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity init Task"); @@ -7349,6 +7370,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity Backward Task"); @@ -7381,6 +7403,7 @@ void 
register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity PEFT Backward Task"); From 96628b360efb6a0299dd9a3a652a91249b722231 Mon Sep 17 00:00:00 2001 From: Yingcheng <135535812+yingchen21@users.noreply.github.com> Date: Thu, 10 Oct 2024 06:27:49 +0800 Subject: [PATCH 29/44] Attention projections (QKV, O) disaggregation (#1436) * merged attn-qkv-proj into peft. commented out some alignment test, but should be equivalent to the oriinal test. * restored and passed the alignement test * linting * rebased onto inference * Bug fixes, uploaded missing cpp implmentation * Code cleanup * clean up * fixed problem with mpt. * update * llama3.1 support * fix * support llama3.2 * fix opt bias? * opt alignment test stub * fix bias * update * fix non-fusion opt * update * fix * cleanup * delete file * cleanup * shellcheck * hip cleanup * fix * hip fixes --------- Co-authored-by: Gabriele Oliaro Co-authored-by: zhihao Co-authored-by: Gabriele Oliaro --- .gitignore | 3 + .../ops/inc_multihead_self_attention.py | 6 - .../inc_multihead_self_attention_verify.py | 6 - .../ops/inc_multiquery_self_attention.py | 6 - .../inc_multiquery_self_attention_verify.py | 6 - .../ops/spec_inc_multihead_self_attention.py | 6 - .../ops/spec_inc_multiquery_self_attention.py | 6 - include/flexflow/flexflow_c.h | 48 +- include/flexflow/inference.h | 39 +- include/flexflow/layer.h | 3 + include/flexflow/model.h | 146 +- include/flexflow/operator.h | 8 +- .../ops/inc_multihead_self_attention.h | 54 +- .../ops/inc_multihead_self_attention_params.h | 5 +- .../inc_multihead_self_attention_kernels.h | 49 +- .../ops/spec_inc_multihead_self_attention.h | 25 +- ...spec_inc_multihead_self_attention_params.h | 4 +- .../ops/tree_inc_multihead_self_attention.h | 26 +- ...tree_inc_multihead_self_attention_params.h | 4 +- inference/models/falcon.cc | 81 +- inference/models/falcon.h | 29 +- inference/models/llama.cc | 72 +- inference/models/llama.h | 29 +- inference/models/mpt.cc | 54 +- inference/models/mpt.h | 2 + inference/models/opt.cc | 62 +- inference/models/opt.h | 9 +- inference/models/starcoder.cc | 55 +- inference/models/starcoder.h | 4 +- inference/python/incr_decoding.py | 10 +- python/flexflow/core/flexflow_cffi.py | 161 +- python/flexflow/serve/models/falcon.py | 56 +- python/flexflow/serve/models/llama.py | 56 +- python/flexflow/serve/models/mpt.py | 46 +- python/flexflow/serve/models/opt.py | 45 +- python/flexflow/serve/models/starcoder.py | 32 +- src/c/flexflow_c.cc | 114 +- src/ops/add_bias_residual_layer_norm.cc | 14 +- src/ops/fused.cpp | 48 +- src/ops/fused.cu | 55 +- src/ops/inc_multihead_self_attention.cc | 496 +-- src/ops/inc_multihead_self_attention.cpp | 1646 ++++----- src/ops/inc_multihead_self_attention.cu | 2972 ++++++++--------- src/ops/kernels/linear_kernels.cu | 1 + src/ops/linear.cc | 6 +- src/ops/residual_layer_norm.cc | 17 +- src/ops/spec_inc_multihead_self_attention.cc | 415 +-- src/ops/spec_inc_multihead_self_attention.cpp | 1056 +++--- src/ops/spec_inc_multihead_self_attention.cu | 101 +- src/ops/tree_inc_multihead_self_attention.cc | 385 +-- src/ops/tree_inc_multihead_self_attention.cpp | 411 +-- src/ops/tree_inc_multihead_self_attention.cu | 409 +-- src/parallel_ops/allreduce.cc | 2 +- src/runtime/file_loader.cc | 406 ++- src/runtime/graph.cc | 107 +- 
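In short, this commit pulls the attention projections out of the fused attention operators: the packed QKV input projection and the output (O) projection become ordinary ff.dense layers named "...qkv_proj" and "...o_proj", the attention ops now take the pre-projected QKV tensor as input, and the per-op bias/add_bias_kv flags plus the single apply_rotary_embedding boolean are replaced by a RotaryEmbeddingMeta struct read from the model config. A minimal sketch of the resulting call pattern for one decoder layer, following the falcon/llama model changes further down; it assumes a model-construction function that receives FFModel &ff and the post-layer-norm tensor att_norm, and the dimensions and "layers.0..." names are placeholders, not part of this diff:

    // Placeholder dimensions standing in for the model-config values.
    int hidden_size = 4096, num_attention_heads = 32, num_key_value_heads = 8;
    int head_dim = hidden_size / num_attention_heads;
    RotaryEmbeddingMeta rope_meta; // populated from the HF config in the real models

    // Packed q/k/v projection as a plain dense layer (the hunks below note
    // that hidden_size * 3 assumes replicated heads).
    Tensor qkv_proj = ff.dense(att_norm,
                               hidden_size * 3,
                               AC_MODE_NONE,
                               false /*bias*/,
                               DT_NONE,
                               nullptr, nullptr, nullptr,
                               REG_MODE_NONE,
                               0.0f,
                               "layers.0.self_attn.qkv_proj");

    // Attention now consumes the projected tensor and carries the rotary
    // embedding settings instead of bias flags.
    Tensor o_proj = ff.inc_multiquery_self_attention(qkv_proj,
                                                     hidden_size,
                                                     num_attention_heads,
                                                     num_key_value_heads,
                                                     head_dim,
                                                     head_dim,
                                                     0.0f /*dropout*/,
                                                     false /*add_zero_attn*/,
                                                     DT_NONE,
                                                     nullptr /*kernel_initializer*/,
                                                     rope_meta,
                                                     false /*scaling_query*/,
                                                     1.0f /*scaling_factor*/,
                                                     true /*qk_prod_scaling*/,
                                                     false /*position_bias*/,
                                                     "layers.0.self_attn");

    // Output projection, also a plain dense layer.
    Tensor attn_out = ff.dense(o_proj,
                               hidden_size,
                               AC_MODE_NONE,
                               false /*bias*/,
                               DT_NONE,
                               nullptr, nullptr, nullptr,
                               REG_MODE_NONE,
                               0.0f,
                               "layers.0.self_attn.o_proj");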
src/runtime/inference_manager.cc | 1 + src/runtime/layer.cc | 17 + src/runtime/model.cc | 51 +- src/runtime/operator.cc | 12 + src/runtime/substitution.cc | 5 +- tests/fine_grained_alignment_test.sh | 106 + tests/inference/huggingface_inference.py | 49 +- tests/inference/inference_alignment_test.py | 817 +++++ tests/peft/alignment/align_test_utils.py | 13 +- tests/peft/hf_finetune.py | 2 +- tests/peft/hf_utils.py | 15 +- tests/peft/peft_alignment_test.py | 39 +- 67 files changed, 5146 insertions(+), 5895 deletions(-) create mode 100755 tests/fine_grained_alignment_test.sh create mode 100644 tests/inference/inference_alignment_test.py diff --git a/.gitignore b/.gitignore index cc34c1a7b6..c1e22fcaba 100644 --- a/.gitignore +++ b/.gitignore @@ -193,3 +193,6 @@ lora_training_logs Untitled-1.ipynb Untitled-2.ipynb tests/inference/python_test_configs/*.json + +core.* +fine_grained_alignment_config.json diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py index dce7bd565d..ab80a5893c 100644 --- a/examples/python/native/ops/inc_multihead_self_attention.py +++ b/examples/python/native/ops/inc_multihead_self_attention.py @@ -11,8 +11,6 @@ def test_inc_multihead_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -34,8 +32,6 @@ def test_inc_multihead_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -85,8 +81,6 @@ def test_inc_multihead_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py index f6dc8e3933..bc2ba5e977 100644 --- a/examples/python/native/ops/inc_multihead_self_attention_verify.py +++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py @@ -11,8 +11,6 @@ def test_inc_multihead_self_attention_verify( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -34,8 +32,6 @@ def test_inc_multihead_self_attention_verify( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -85,8 +81,6 @@ def test_inc_multihead_self_attention_verify( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py b/examples/python/native/ops/inc_multiquery_self_attention.py index 33390ab1f6..424b46b0f4 100644 --- a/examples/python/native/ops/inc_multiquery_self_attention.py +++ b/examples/python/native/ops/inc_multiquery_self_attention.py @@ -12,8 +12,6 @@ def test_inc_multiquery_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - 
bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -36,8 +34,6 @@ def test_inc_multiquery_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -89,8 +85,6 @@ def test_inc_multiquery_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py index 69a76f68bf..b2c0e7dcf5 100644 --- a/examples/python/native/ops/inc_multiquery_self_attention_verify.py +++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py @@ -12,8 +12,6 @@ def test_inc_multiquery_self_attention_verify( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -36,8 +34,6 @@ def test_inc_multiquery_self_attention_verify( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -89,8 +85,6 @@ def test_inc_multiquery_self_attention_verify( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py index bd1aaa189b..d0fa5f7689 100644 --- a/examples/python/native/ops/spec_inc_multihead_self_attention.py +++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py @@ -11,8 +11,6 @@ def test_spec_inc_multihead_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -34,8 +32,6 @@ def test_spec_inc_multihead_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -85,8 +81,6 @@ def test_spec_inc_multihead_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py index 0b731c99e0..0d04f639c9 100644 --- a/examples/python/native/ops/spec_inc_multiquery_self_attention.py +++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py @@ -12,8 +12,6 @@ def test_spec_inc_multiquery_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -36,8 +34,6 @@ def 
test_spec_inc_multiquery_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -89,8 +85,6 @@ def test_spec_inc_multiquery_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52b4b3d362..c1e18e660b 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -445,12 +445,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -465,12 +469,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -485,12 +493,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -506,12 +518,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -527,12 +543,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -548,12 +568,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float 
rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index ba4101c173..755df9f5cb 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -43,8 +43,43 @@ struct GenerationResult { std::vector finetuning_losses; }; -#include -#include +struct RotaryEmbeddingMeta { + bool apply_rotary_embedding = false; + float rope_theta = 10000.0f; + std::string rope_type = "default"; + float factor = 8.0f; + float low_freq_factor = 1.0f; + float high_freq_factor = 4.0f; + int original_max_position_embeddings = 8192; + + RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false, + float rope_theta_ = 10000.0f, + std::string rope_type_ = "default", + float factor_ = 8.0f, + float low_freq_factor_ = 1.0f, + float high_freq_factor_ = 4.0f, + int original_max_position_embeddings_ = 8192) + : apply_rotary_embedding(apply_rotary_embedding_), + rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_), + low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_), + original_max_position_embeddings(original_max_position_embeddings_) {} + + friend std::ostream &operator<<(std::ostream &os, + RotaryEmbeddingMeta const &meta) { + os << std::boolalpha // To print bool as true/false instead of 1/0 + << "RotaryEmbeddingMeta {\n" + << " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n" + << " rope_theta: " << meta.rope_theta << ",\n" + << " rope_type: \"" << meta.rope_type << "\",\n" + << " factor: " << meta.factor << ",\n" + << " low_freq_factor: " << meta.low_freq_factor << ",\n" + << " high_freq_factor: " << meta.high_freq_factor << ",\n" + << " original_max_position_embeddings: " + << meta.original_max_position_embeddings << "\n" + << "}"; + return os; + } +}; std::string join_path(std::vector const &paths); diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index c3dbcac422..e18bad3982 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -32,11 +32,13 @@ class Layer { void add_float_property(std::string const &key, float value); void add_int_vector_property(std::string const &key, std::vector const &value); + void add_string_property(std::string const &key, std::string const &value); void add_initializer(std::string const &key, Initializer *initializer); bool get_int_property(std::string const &key, long long &value) const; bool get_float_property(std::string const &key, float &value) const; bool get_int_vector_property(std::string const &key, std::vector &value) const; + bool get_string_property(std::string const &key, std::string &value) const; bool get_initializer(std::string const &key, Initializer *&initializer) const; Tensor get_parameter(int index); void print(); @@ -59,6 +61,7 @@ class Layer { std::unordered_map float_properties; std::unordered_map initializers; std::unordered_map> int_vector_properties; + std::unordered_map string_properties; }; }; // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..51b7950db8 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -733,41 +733,38 @@ class FFModel { DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, char const *name = NULL); - Tensor inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool 
add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); + Tensor inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor spec_inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); Tensor inc_multihead_self_attention_verify( const Tensor input, int embed_dim, @@ -775,54 +772,49 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor spec_inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - Tensor inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool 
add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); Tensor inc_multiquery_self_attention_verify( const Tensor input, int embed_dim, @@ -831,12 +823,10 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1a5af67b36..007314797a 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -335,7 +335,13 @@ class Op { // only dump the weights in the forward pass, at the first step // note that we do not save the weight gradients, since we only support // finetuning LoRA weights, which are not FF tensors. - if (fwd_pass && m->decoding_step == 0) { + // Set FF_DEBG_NO_WEIGHTS=1 or to FF_DEBG_NO_WEIGHTS=true to disable saving + // weights + bool do_not_save_weights = + (std::getenv("FF_DEBG_NO_WEIGHTS") && + (std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "1" || + std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "true")); + if (fwd_pass && m->decoding_step == 0 && !do_not_save_weights) { fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) / layername; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index f77df7c456..4519cf8215 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -36,49 +36,40 @@ class IncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, ParallelTensor const _input, - ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, IncMultiHeadSelfAttention const 
&other, - ParallelTensor const input, - bool allocate_weights); + ParallelTensor const input); IncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -125,24 +116,20 @@ class IncMultiHeadSelfAttention : public Op { BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); - static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &bias); + GenericTensorAccessorW const &output); + static void + peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; @@ -153,7 +140,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: IncMultiHeadSelfAttentionMeta(FFHandler handler, IncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -168,14 +154,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, - bool _qkv_bias, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _final_bias, float _scaling_factor, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_q_heads, @@ -188,30 +171,23 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: Realm::RegionInstance reserveInst; - size_t weights_params, weightSize, biasSize, reserveSpaceSize, - quantized_weightSize; + size_t reserveSpaceSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, hidden_size; - bool *has_load_weights; - bool *apply_rotary_embedding; - bool *qkv_bias; - bool *final_bias; + RotaryEmbeddingMeta *rotary_embedding_meta; bool *scaling_query; bool *qk_prod_scaling; bool *position_bias; float scaling_factor; - void *weight_ptr, *bias_ptr; // for weight offload void *devQKVProjArray, *keyCache, *valueCache; void *qk_prods, *qk_prods_softmax; void *attn_heads; - char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; BatchConfig::PerRequestInfo *request_infos; DataType quantization_type; bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - // cudaStream_t task_local_stream; cudnnTensorDescriptor_t qk_tensor; cuFloatComplex *complex_input; #elif defined(FF_USE_HIP_ROCM) diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 
58681069e2..9b0a26e5d7 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { @@ -12,8 +13,8 @@ struct IncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; bool offload; char name[MAX_OPNAME]; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 26dcf12425..16d5915381 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -14,6 +14,11 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +template +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + ffStream_t stream); template void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -21,14 +26,11 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, ffStream_t stream); template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - ffStream_t stream); +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + ffStream_t stream); template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -38,27 +40,6 @@ __global__ void apply_position_bias_qkprd(DT *input_ptr, int global_num_q_heads, int shard_id); -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize); - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int num_heads, - int num_kv_heads, - bool scaling_query, - float scaling_factor, - int hidden_size); - #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) template __global__ void @@ -91,16 +72,6 @@ __global__ void bool q_tensor); #endif -template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - ffStream_t stream); - template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a0d01092bf..155132a7fe 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -33,43 +33,34 @@ class SpecIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + 
RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); + const ParallelTensor input); SpecIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -112,17 +103,14 @@ class SpecIncMultiHeadSelfAttention : public Op { BeamSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); + GenericTensorAccessorW const &output); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -131,7 +119,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 1461224ba9..a0ae3fc4f2 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -11,8 +11,8 @@ struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 168ad5f618..9755e62d42 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -33,49 +33,40 @@ class TreeIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool 
allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); + const ParallelTensor input); TreeIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -114,18 +105,14 @@ class TreeIncMultiHeadSelfAttention : public Op { TreeVerifyBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); - + GenericTensorAccessorW const &output); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; @@ -136,7 +123,6 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: TreeIncMultiHeadSelfAttentionMeta(FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index d1a51b8b8f..b49db2c10d 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -12,8 +12,8 @@ struct TreeIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; bool offload; char name[MAX_OPNAME]; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 195d6ba7e3..fd4da87b99 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -60,6 +60,7 @@ void FALCON::create_falcon_model(FFModel &ff, "word_embeddings"); Tensor mha = nullptr, mlp_output = nullptr; + Tensor qkv_proj = nullptr, o_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; for (int i = 0; i < falcon_config.n_layer; i++) { @@ -97,26 +98,41 @@ void FALCON::create_falcon_model(FFModel &ff, att_norm = res_ln_outputs[1]; } + qkv_proj = ff.dense( + att_norm, + falcon_config.hidden_size * 
+ 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." + std::to_string(i) + ".self_attention.qkv_proj") + .c_str()); + qkv_proj->print("qkv_proj"); + switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multiquery_self_attention( - att_norm, + o_proj = ff.spec_inc_multiquery_self_attention( + qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); @@ -124,24 +140,22 @@ void FALCON::create_falcon_model(FFModel &ff, } case TREE_VERIFY_MODE: { - mha = ff.inc_multiquery_self_attention_verify( - att_norm, + o_proj = ff.inc_multiquery_self_attention_verify( + qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); @@ -149,24 +163,22 @@ void FALCON::create_falcon_model(FFModel &ff, } case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( - att_norm, + o_proj = ff.inc_multiquery_self_attention( + qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); @@ -177,6 +189,21 @@ void FALCON::create_falcon_model(FFModel &ff, } } + mha = ff.dense( + o_proj, + falcon_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attention.o_proj") + .c_str()); + mha->print("mha"); + Tensor dense_h_to_4h = ff.dense( att_norm, falcon_config.hidden_size * 4, diff --git a/inference/models/falcon.h b/inference/models/falcon.h index fce2dade3f..565d7e5419 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -50,6 +50,26 @@ class FALCON { : model_config["num_hidden_layers"]; parallel_attn = model_config["parallel_attn"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if (model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -59,8 +79,6 @@ class FALCON { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -76,9 +94,8 @@ class FALCON { std::cout << "\tn_layer: " << n_layer << std::endl; std::cout << "\tparallel_attn: " << parallel_attn << std::endl; std::cout << "\tvocab_size: " << vocab_size << std::endl; - - // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } @@ -86,8 +103,8 @@ class FALCON { bool bias, multi_query, parallel_attn; int hidden_size, n_head, n_head_kv, n_layer, vocab_size; float layer_norm_epsilon; - // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_falcon_model(FFModel &ff, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index cf26194597..bd5243bd4b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -91,28 +91,41 @@ void LLAMA::create_llama_model(FFModel &ff, token = token_att_norm[0]; att_norm = token_att_norm[1]; } + Tensor qkv_proj = ff.dense( + att_norm, + llama_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like llama does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." 
+ std::to_string(i) + ".self_attn.qkv_proj") + .c_str()); Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); @@ -120,23 +133,21 @@ void LLAMA::create_llama_model(FFModel &ff, } case TREE_VERIFY_MODE: { mha = ff.inc_multiquery_self_attention_verify( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); @@ -144,23 +155,21 @@ void LLAMA::create_llama_model(FFModel &ff, } case INC_DECODING_MODE: { mha = ff.inc_multiquery_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); @@ -171,6 +180,21 @@ void LLAMA::create_llama_model(FFModel &ff, } } + Tensor mha_input = mha; + mha = ff.dense( + mha_input, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attn.o_proj") + .c_str()); + // step 2: SILU activaion Tensor token_ff_norm[2] = {nullptr, nullptr}; ff.residual_rms_norm( diff --git a/inference/models/llama.h b/inference/models/llama.h index edb78f1300..853a51a999 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -44,6 +44,26 @@ class LLAMA { hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if (model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing LLAMA config from JSON file: " << e.what() << std::endl; @@ -54,8 +74,6 @@ class LLAMA { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -71,18 +89,17 @@ class LLAMA { std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; - - // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } - // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, hidden_size, intermediate_size; float rms_norm_eps; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_llama_model(FFModel &ff, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index e4a7e0056d..d02c0f3b82 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -93,22 +93,35 @@ void MPT::create_mpt_model(FFModel &ff, layernorm_output = res_ln_outputs[1]; } - Tensor attn_outputs; + Tensor qkv_proj = ff.dense( + layernorm_output, + mpt_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." 
+ std::to_string(i) + ".attn.qkv_proj").c_str()); + + Tensor o_proj; switch (mode) { case BEAM_SEARCH_MODE: { - attn_outputs = ff.spec_inc_multihead_self_attention( - layernorm_output, + o_proj = ff.spec_inc_multihead_self_attention( + qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, false, - false, - false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), @@ -120,19 +133,17 @@ void MPT::create_mpt_model(FFModel &ff, break; } case TREE_VERIFY_MODE: { - attn_outputs = ff.inc_multihead_self_attention_verify( - layernorm_output, + o_proj = ff.inc_multihead_self_attention_verify( + qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, false, - false, - false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), @@ -144,19 +155,17 @@ void MPT::create_mpt_model(FFModel &ff, break; } case INC_DECODING_MODE: { - attn_outputs = ff.inc_multihead_self_attention( - layernorm_output, + o_proj = ff.inc_multihead_self_attention( + qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, false, - false, - false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), @@ -172,6 +181,19 @@ void MPT::create_mpt_model(FFModel &ff, } } + Tensor attn_outputs = ff.dense( + o_proj, + mpt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".attn.o_proj").c_str()); + ff.residual_layer_norm( attn_outputs, hidden_states, diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 08597e1d75..3001420ad0 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -37,6 +37,7 @@ class MPT { n_heads = model_config["n_heads"]; n_layers = model_config["n_layers"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -63,6 +64,7 @@ class MPT { // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; int hidden_size, n_heads, n_layers, vocab_size; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_mpt_model(FFModel &ff, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index b3f2ef4e17..34a6bb0f02 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -101,23 +101,37 @@ void OPT::create_opt_model(FFModel &ff, Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; - Tensor mha; + Tensor qkv_proj = ff.dense( + hidden_states, + opt_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + true, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." 
+ std::to_string(i) + ".self_attn.qkv_proj") + .c_str()); + + Tensor o_proj; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( - hidden_states, + o_proj = ff.spec_inc_multihead_self_attention( + qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ - true, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ @@ -128,20 +142,18 @@ void OPT::create_opt_model(FFModel &ff, break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( - hidden_states, + o_proj = ff.inc_multihead_self_attention_verify( + qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ - true, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ @@ -152,20 +164,18 @@ void OPT::create_opt_model(FFModel &ff, break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( - hidden_states, + o_proj = ff.inc_multihead_self_attention( + qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ - true, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ @@ -180,6 +190,20 @@ void OPT::create_opt_model(FFModel &ff, } } + Tensor mha = ff.dense( + o_proj, + opt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attn.o_proj") + .c_str()); + ff.add_bias_residual_layer_norm(mha, residual, res_ln_outputs, diff --git a/inference/models/opt.h b/inference/models/opt.h index 7c736a26d1..8b85f81aa6 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -45,6 +45,7 @@ class OPT { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; word_embed_proj_dim = model_config["word_embed_proj_dim"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -54,8 +55,6 @@ class OPT { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -78,9 +77,8 @@ class OPT { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim << std::endl; - - // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } @@ -91,6 +89,7 @@ class OPT { float dropout; int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads, num_hidden_layers, vocab_size, word_embed_proj_dim; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_opt_model(FFModel &ff, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index cd8bf3a9a7..2429b1ec1b 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -102,11 +102,28 @@ void STARCODER::create_starcoder_model( Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; + Tensor qkv_proj = ff.dense( + ln_1, + startcoder_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." + std::to_string(i) + ".self_attention.qkv_proj") + .c_str()); + Tensor mha; + Tensor o_proj; switch (mode) { case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( - ln_1, + o_proj = ff.inc_multiquery_self_attention( + qkv_proj, startcoder_config.hidden_size, startcoder_config.num_attention_heads, 1, @@ -114,17 +131,15 @@ void STARCODER::create_starcoder_model( startcoder_config.num_attention_heads, startcoder_config.hidden_size / startcoder_config.num_attention_heads, - startcoder_config.dropout_p, /*dropout*/ - true, /*bias*/ - false, /*add_bias_kv*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + startcoder_config.dropout_p, /*dropout*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + startcoder_config.rotary_embedding_meta, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." 
+ std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); @@ -135,6 +150,20 @@ void STARCODER::create_starcoder_model( } } + mha = ff.dense( + o_proj, + startcoder_config.hidden_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".self_attn.o_proj") + .c_str()); + ff.residual_layer_norm( hidden_states, mha, diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index 0e9577d569..7ff6f33770 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -41,6 +41,7 @@ class STARCODER { intermediate_size = model_config["n_inner"]; dropout_p = model_config["attn_pdrop"]; max_position_embeddings = model_config["n_positions"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing STARCODER config from JSON file: " << e.what() << std::endl; @@ -51,8 +52,6 @@ class STARCODER { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -64,6 +63,7 @@ class STARCODER { int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size, max_position_embeddings; float layer_norm_epsilon, dropout_p; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_starcoder_model(FFModel &ff, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index f888982f2c..1df5a05a8f 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -111,9 +111,15 @@ def main(): if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] - results = llm.generate(prompts) + if "max_length" not in configs_dict: + results = llm.generate(prompts) + else: + results = llm.generate(prompts, max_length=configs.max_length) else: - result = llm.generate("Three tips for staying healthy are: ") + if "max_length" not in configs_dict: + result = llm.generate("Three tips for staying healthy are: ") + else: + result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length) llm.stop_server() diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 7692ccb88f..a5aadc270e 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -41,6 +41,7 @@ from typing import Union, List from peft import LoraConfig import json +from dataclasses import dataclass def ffc(): @@ -2070,6 +2071,22 @@ def __init__( self.max_training_steps = max_training_steps +# ----------------------------------------------------------------------- +# RotaryEmbeddingMeta +# ----------------------------------------------------------------------- + + +@dataclass +class RotaryEmbeddingMeta: + apply_rotary_embedding: bool = False + rope_theta: float = 10000.0 + rope_type: str = "default" + factor: float = 8.0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 4.0 + original_max_position_embeddings: int = 8192 + + # ----------------------------------------------------------------------- # FFModel # ----------------------------------------------------------------------- @@ -3509,12 +3526,10 @@ def inc_multihead_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - 
apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3543,12 +3558,6 @@ def inc_multihead_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3558,8 +3567,8 @@ def inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3589,12 +3598,16 @@ def inc_multihead_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3612,12 +3625,10 @@ def spec_inc_multihead_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3646,12 +3657,6 @@ def spec_inc_multihead_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3661,8 +3666,8 @@ def spec_inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
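Taken together, the changes above replace the per-flag bias/add_bias_kv/apply_rotary_embedding arguments with a single RotaryEmbeddingMeta value and move the QKV and output projections out of the attention operator into explicit dense layers. A minimal usage sketch of the new Python API follows; it assumes an existing FFModel instance (ffmodel), an upstream layer-norm output (attn_norm), and illustrative sizes, and it assumes RotaryEmbeddingMeta and ActiMode are exported alongside the other core types. None of these names beyond the API calls themselves come from the patch.

    from flexflow.core import RotaryEmbeddingMeta, ActiMode

    # Illustrative values; a real model reads these from its HF config.
    hidden_size, num_heads = 4096, 32

    # Fused QKV projection feeding the attention op
    # (the projection weights now live in this dense layer, not in the attention op).
    qkv_proj = ffmodel.dense(
        attn_norm,                      # assumed upstream layer-norm output
        3 * hidden_size,
        ActiMode.AC_MODE_NONE,
        False,
        name="layers.0.self_attn.qkv_proj",
    )

    rope = RotaryEmbeddingMeta(
        apply_rotary_embedding=True,
        rope_theta=10000.0,
    )

    o_proj_in = ffmodel.inc_multihead_self_attention(
        qkv_proj,
        hidden_size,
        num_heads,
        rotary_embedding_meta=rope,     # replaces the old apply_rotary_embedding flag
        name="layers.0.self_attn",
    )

    # Output projection applied outside the attention op.
    attn_out = ffmodel.dense(
        o_proj_in,
        hidden_size,
        ActiMode.AC_MODE_NONE,
        False,
        name="layers.0.self_attn.o_proj",
    )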
:type scaling_query: bool @@ -3692,12 +3697,16 @@ def spec_inc_multihead_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3715,12 +3724,10 @@ def inc_multihead_self_attention_verify( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3749,12 +3756,6 @@ def inc_multihead_self_attention_verify( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3764,8 +3765,8 @@ def inc_multihead_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3795,12 +3796,16 @@ def inc_multihead_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3819,12 +3824,10 @@ def inc_multiquery_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3856,12 +3859,6 @@ def inc_multiquery_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3871,8 +3868,8 @@ def inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. 
:type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3903,12 +3900,16 @@ def inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3927,12 +3928,10 @@ def spec_inc_multiquery_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3964,12 +3963,6 @@ def spec_inc_multiquery_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3979,8 +3972,8 @@ def spec_inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -4011,12 +4004,16 @@ def spec_inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -4035,12 +4032,10 @@ def inc_multiquery_self_attention_verify( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -4072,12 +4067,6 @@ def inc_multiquery_self_attention_verify( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. 
- :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -4087,8 +4076,8 @@ def inc_multiquery_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -4119,12 +4108,16 @@ def inc_multiquery_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0e8fbcbd7d..0c6102406f 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -41,6 +41,17 @@ def __init__(self, hf_config): ) self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = self.n_head self.num_key_value_heads = self.n_head_kv @@ -54,8 +65,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -63,11 +72,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - # self.falcon_config.max_seq_length = max_seq_length - # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -138,60 +144,70 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.input_layernorm", ) + qkv_proj = ffmodel.dense( + att_norm, + 3 * self.falcon_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attention.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = 
ffmodel.spec_inc_multiquery_self_attention( - att_norm, + o_proj = ffmodel.spec_inc_multiquery_self_attention( + qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.falcon_config.rotary_embedding_meta, name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - mha = ffmodel.inc_multiquery_self_attention_verify( - att_norm, + o_proj = ffmodel.inc_multiquery_self_attention_verify( + qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.falcon_config.rotary_embedding_meta, name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( - att_norm, + o_proj = ffmodel.inc_multiquery_self_attention( + qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.falcon_config.rotary_embedding_meta, name=f"layers.{i}.self_attention", ) else: assert False + mha = ffmodel.dense( + o_proj, + self.falcon_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attention.o_proj" + ) + dense_h_to_4h = ffmodel.dense( att_norm, self.falcon_config.hidden_size * 4, diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 96f0258572..e149834603 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,6 @@ class LLAMAConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -29,6 +27,17 @@ def __init__(self, hf_config): self.hidden_size = hf_config.hidden_size self.rms_norm_eps = hf_config.rms_norm_eps self.intermediate_size = hf_config.intermediate_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads 
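The same rope_theta/rope_scaling parsing recurs in each serve model config (falcon.py above, llama.py here). A compact equivalent is sketched below, using getattr in place of the `in hf_config.__dict__` checks; the helper name `_rotary_meta_from_hf` is hypothetical and not part of FlexFlow, and RotaryEmbeddingMeta is assumed to be in scope as in these modules. The patch itself keeps the inline form.

    def _rotary_meta_from_hf(hf_config):
        # Mirrors the inline logic above: default theta 10000.0,
        # plus the optional HF rope_scaling dict when present.
        meta = RotaryEmbeddingMeta(
            apply_rotary_embedding=True,
            rope_theta=getattr(hf_config, "rope_theta", 10000.0),
        )
        scaling = getattr(hf_config, "rope_scaling", None)
        if scaling is not None:
            meta.rope_type = scaling["rope_type"]
            meta.factor = scaling["factor"]
            meta.low_freq_factor = scaling["low_freq_factor"]
            meta.high_freq_factor = scaling["high_freq_factor"]
            meta.original_max_position_embeddings = scaling[
                "original_max_position_embeddings"
            ]
        return meta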
self.num_key_value_heads = ( @@ -55,11 +64,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - # self.llama_config.max_seq_length = max_seq_length - # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2 ** 31 - 1 @@ -128,9 +134,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.input_layernorm", ) + qkv_proj = ffmodel.dense( + attn_norm, + 3 * self.llama_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attn.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: mha = ffmodel.spec_inc_multiquery_self_attention( - attn_norm, + qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, self.llama_config.num_key_value_heads, @@ -139,17 +153,15 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.llama_config.rotary_embedding_meta, name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( - attn_norm, + qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, self.llama_config.num_key_value_heads, @@ -158,17 +170,15 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.llama_config.rotary_embedding_meta, name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( - attn_norm, + qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, self.llama_config.num_key_value_heads, @@ -177,20 +187,26 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.llama_config.rotary_embedding_meta, name=f"layers.{i}.self_attn", ) else: assert False + o_proj = ffmodel.dense( + mha, + self.llama_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attn.o_proj" + ) + token, ff_norm = ffmodel.residual_rms_norm( token, - mha, + o_proj, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, name=f"layers.{i}.post_attention_layernorm", @@ -259,3 +275,7 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + # LM head weight + model.lm_head.weight.detach().cpu().numpy().tofile( + os.path.join(dst_folder, "lm_head.weight") + ) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index b350ae106d..a0e70b381a 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,6 @@ class MPTConfig: def __init__(self, hf_config): - # 
self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -28,6 +26,7 @@ def __init__(self, hf_config): self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_heads self.num_key_value_heads = hf_config.n_heads @@ -50,11 +49,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - # self.mpt_config.max_seq_length = max_seq_length - # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -129,20 +125,26 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.norm_1", ) + qkv_proj = ffmodel.dense( + layernorm_output, + 3 * self.mpt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.attn.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: - attn_outputs = ffmodel.spec_inc_multihead_self_attention( - layernorm_output, + o_proj = ffmodel.spec_inc_multihead_self_attention( + qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor @@ -151,19 +153,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - attn_outputs = ffmodel.inc_multihead_self_attention_verify( - layernorm_output, + o_proj = ffmodel.inc_multihead_self_attention_verify( + qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor @@ -172,19 +172,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - attn_outputs = ffmodel.inc_multihead_self_attention( - layernorm_output, + o_proj = ffmodel.inc_multihead_self_attention( + qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor @@ -195,6 +193,14 @@ def build_model(self, max_tokens_per_batch): else: assert False + attn_outputs = ffmodel.dense( + 
o_proj, + self.mpt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.attn.o_proj" + ) + hidden_states, layernorm_output = ffmodel.residual_layer_norm( attn_outputs, hidden_states, diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 02668abf59..ba2e21b690 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -34,6 +34,7 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.word_embed_proj_dim = hf_config.word_embed_proj_dim + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = hf_config.num_attention_heads @@ -47,8 +48,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -56,11 +55,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - # self.opt_config.max_seq_length = max_seq_length - # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -145,20 +141,26 @@ def build_model(self, max_tokens_per_batch): hidden_states = ffmodel.add(token, positional_embedding) residual = hidden_states + qkv_proj = ffmodel.dense( + hidden_states, + 3 * self.opt_config.hidden_size, + ActiMode.AC_MODE_NONE, + True, + name=f"layers.{i}.self_attn.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = ffmodel.spec_inc_multihead_self_attention( - hidden_states, + o_proj = ffmodel.spec_inc_multihead_self_attention( + qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor @@ -166,19 +168,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - mha = ffmodel.inc_multihead_self_attention_verify( - hidden_states, + o_proj = ffmodel.inc_multihead_self_attention_verify( + qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor @@ -186,19 +186,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multihead_self_attention( - hidden_states, + o_proj = 
ffmodel.inc_multihead_self_attention( + qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor @@ -208,6 +206,13 @@ def build_model(self, max_tokens_per_batch): else: assert False + mha = ffmodel.dense( + o_proj, + self.opt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attn.o_proj" + ) # This is either a before or after attention LayerNorm. In both cases, we need to compute the LN here. residual, ff_norm = ffmodel.add_bias_residual_layer_norm( mha, diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 2d4471201f..dc5faf175f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,6 @@ class STARCODERConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -32,6 +30,7 @@ def __init__(self, hf_config): self.vocab_size = hf_config.vocab_size self.intermediate_size = hf_config.n_inner self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_head self.num_key_value_heads = self.n_head_kv @@ -45,8 +44,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,11 +51,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - # self.starcoder_config.max_seq_length = max_seq_length - # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -142,9 +136,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.ln_1", ) - assert self.mode == InferenceMode.INC_DECODING_MODE - mha = ffmodel.inc_multiquery_self_attention( + qkv_proj = ffmodel.dense( ln_1, + 3 * self.starcoder_config.hidden_size, + ActiMode.AC_MODE_NONE, + True, + name=f"layers.{i}.self_attn.qkv_proj", + ) + + assert self.mode == InferenceMode.INC_DECODING_MODE + o_proj = ffmodel.inc_multiquery_self_attention( + qkv_proj, self.starcoder_config.hidden_size, self.starcoder_config.num_attention_heads, self.starcoder_config.n_head_kv, @@ -153,15 +155,21 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size // self.starcoder_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.starcoder_config.rotary_embedding_meta, name=f"layers.{i}.attn.c_attn", ) + mha = ffmodel.dense( + o_proj, + self.starcoder_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + 
name=f"layers.{i}.self_attn.o_proj" + ) + residual, l2_norm = ffmodel.residual_layer_norm( hidden_states, mha, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 532dd00198..c6cf656ac0 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1209,12 +1209,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1224,18 +1228,23 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, num_heads, kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1252,12 +1261,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1267,6 +1280,13 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multihead_self_attention(input, embed_dim, @@ -1274,12 +1294,10 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1296,12 +1314,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1311,6 +1333,13 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + 
low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention_verify(input, embed_dim, @@ -1318,12 +1347,10 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1341,12 +1368,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1356,6 +1387,13 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multiquery_self_attention(input, embed_dim, num_q_heads, @@ -1363,12 +1401,10 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1386,12 +1422,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1401,6 +1441,13 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multiquery_self_attention(input, embed_dim, @@ -1409,12 +1456,10 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1432,12 +1477,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1447,6 +1496,13 @@ flexflow_tensor_t 
flexflow_model_add_inc_multiquery_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multiquery_self_attention_verify(input, embed_dim, @@ -1455,12 +1511,10 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 7a1da2e974..7bfbe31aad 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -670,8 +670,18 @@ void AddBiasResidualLayerNorm::inference_task( AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + int expected_regions = + 5; // input, attn_bias, residual (input), added_output, output + if (m->inplace_residual) { + expected_regions--; // input == added_output + } + if (m->elementwise_affine) { + expected_regions += 1; // gamma + if (m->use_bias) { + expected_regions += 1; // beta + } + } + assert(regions.size() == expected_regions); int rid = 0, tid = 0, did = 0; GenericTensorAccessorR input = diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 9f826cd611..2cede662f3 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -439,21 +439,13 @@ __host__ void assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + assert(fused->op_num_weights[op] == 0); IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -463,21 +455,13 @@ __host__ void (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; TreeVerifyBatchConfig const &tree_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + assert(fused->op_num_weights[op] == 0); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &tree_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -489,21 +473,13 @@ __host__ void // (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &beam_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + 
assert(fused->op_num_weights[op] == 0); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &beam_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_LAYERNORM: { @@ -1025,21 +1001,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + assert(fused->op_num_weights[op] == 0); IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_grad_accessor[0], - my_weight_accessor[0], - my_output_grad_accessor[0], - biases); + my_output_grad_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 8f1212beb4..5aed2cd69a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -448,73 +448,49 @@ __host__ void case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); + assert(fused->op_num_weights[op] == 0); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); + assert(fused->op_num_weights[op] == 0); TreeIncMultiHeadSelfAttentionMeta *m = (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; TreeVerifyBatchConfig const &tree_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &tree_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); + assert(fused->op_num_weights[op] == 0); SpecIncMultiHeadSelfAttentionMeta const *m = (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; // BeamSearchBatchConfig const *beam_bc = // (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &beam_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &beam_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_output_accessor[0]); break; } case OP_LAYERNORM: { @@ -666,12 +642,7 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging && - !(fused->op_op_type[op] == OP_ALLREDUCE || - fused->op_op_type[op] == OP_PARALLEL_IDENTITY || - fused->op_op_type[op] == OP_REPLICATE || - fused->op_op_type[op] == OP_REPARTITION || - fused->op_op_type[op] == OP_COMBINE)) { + if (metas->meta[op]->inference_debugging) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; @@ -1048,21 +1019,15 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); + assert(fused->op_num_weights[op] == 0); GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_grad_accessor[0], - my_weight_accessor[0], - my_output_grad_accessor[0], - biases); + my_output_grad_accessor[0]); + // biases); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8219cf9e1f..8dbce00ebc 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -54,23 +54,22 @@ bool IncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor FFModel::inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { return inc_multiquery_self_attention(input, embed_dim, num_heads, @@ -78,12 +77,10 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -91,31 +88,29 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, name); } -Tensor FFModel::inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + 
RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; bool offload = cpu_offload; Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -123,7 +118,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, casted_input); } else { @@ -132,7 +127,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, input); } @@ -142,65 +137,30 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, for (int i = 0; i < numdims; i++) { dims[i] = input->dims[i]; } - dims[0] = embed_dim; + dims[0] = vdim * num_q_heads; // we now output o_proj_dim * o_heads li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - // allocate num_q_heads for key, value for replication - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - int one_head_size = qParas + kParas + vParas + oParas; - - { - // compress the weight size if quantization. - if (quantization_type != DT_NONE) { - one_head_size = get_quantization_to_byte_size( - data_type, quantization_type, one_head_size); - } - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering( - 1, - dims, - quantization_type == DT_NONE ? data_type : quantization_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? 
oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); @@ -231,14 +191,20 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -264,15 +230,12 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, - false /*allocate_weights*/, quantization_type, offload, tensor_parallelism_degree, @@ -289,15 +252,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, @@ -308,13 +268,12 @@ 
IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1), /*weights*/ + 0, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -334,86 +293,29 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( x *= _input->dims[i].size; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, (qParas + kParas + vParas + oParas)); - } - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - nullptr /*owner_op*/, - model.config.computationMode == COMP_MODE_INFERENCE - ? false - : true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } + // Removed restriction that no parallelism along this dim + // assert(dims[0].degree == 1); outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ /* assert(check_output_input_weight_parallel_dims()); */ } IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, @@ -424,14 +326,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1), /*weights*/ + 0, 1 /*outputs*/, - _input, - _weight), + _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -439,9 +339,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) -// bias_initializer(_bias_initializer) -{ + tensor_parallelism_degree(_tensor_parallelism_degree) { numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -451,63 +349,10 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0].size = _embed_dim; // Currently require no parallelism along this dim assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads - // * (kParas + vParas); - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, (qParas + kParas + vParas + oParas)); - } - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ // Check correctness /* assert(check_output_input_weight_parallel_dims()); */ } @@ -515,8 +360,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, IncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) + const ParallelTensor input) : IncMultiHeadSelfAttention(model, other.layer_guid, input, @@ -526,15 +370,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.qkv_bias, - other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, - allocate_weights, other.quantization_type, other.offload, other.tensor_parallelism_degree, @@ -544,7 +385,6 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, IncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, - bool allocate_weights, char const *name) : IncMultiHeadSelfAttention(model, params.layer_guid, @@ -555,15 +395,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.qkv_bias, - params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, - allocate_weights, params.quantization_type, params.offload, params.tensor_parallelism_degree, @@ -596,20 +433,12 @@ void IncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection 
id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -636,18 +465,12 @@ void IncMultiHeadSelfAttention::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -655,8 +478,7 @@ void IncMultiHeadSelfAttention::init(FFModel const &ff) { /* regions[0](I): input - regions[1](I): weight - regions[2](O): output + regions[1](O): output */ OpMeta *IncMultiHeadSelfAttention::init_task( Task const *task, @@ -675,17 +497,10 @@ OpMeta *IncMultiHeadSelfAttention::init_task( FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], + regions[1], + task->regions[1], FID_DATA, ctx, runtime); @@ -698,8 +513,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { @@ -708,14 +521,8 @@ OpMeta *IncMultiHeadSelfAttention::init_task( gpu_mem_allocator.register_reserved_work_space( handle.offload_reserve_space, handle.offload_reserve_space_size); } - IncMultiHeadSelfAttentionMeta *m = - new IncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); + IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); if (handle.offload_reserve_space == nullptr) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -725,10 +532,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task( m->inference_debugging = attn->inference_debugging; std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; - if (attn->quantization_type == DT_NONE) { - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - } return m; } @@ -770,14 +573,6 @@ FutureMap IncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, @@ -785,23 +580,12 @@ FutureMap IncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void IncMultiHeadSelfAttention::inference_task( Task const *task, @@ -822,54 +606,31 @@ void IncMultiHeadSelfAttention::inference_task( IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); // input and output GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, task->index_point.point_data[0], input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } IncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, bc, {input}, weights_accessors, {output}); + m, shard_id, bc, {input}, {}, {output}); } } @@ -903,14 +664,6 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement( RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, @@ -918,23 +671,12 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void IncMultiHeadSelfAttention::peft_bwd_task( Task const *task, @@ -954,55 +696,31 @@ void IncMultiHeadSelfAttention::peft_bwd_task( IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); // input grad, output grad GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_grad_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_grad_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - input_grad, - weight, - output_grad, - biases); + m, bc, task->index_point.point_data[0], input_grad, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; IncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); } } @@ -1032,9 +750,20 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + 
rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -1049,10 +778,8 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -1081,10 +808,15 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 826fea4347..a4604a11a2 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -19,6 +19,7 @@ #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" #include "hip/hip_complex.h" +#include #include namespace FlexFlow { @@ -52,6 +53,339 @@ __device__ __forceinline__ T #endif } +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = 
vVal; + } +} + +template +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +template +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + hipStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + hipblasDatatype_t compute_type = cublas_data_type; + + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { + continue; + } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
<DT *>
(m->devQKVProjArray), + static_cast
<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
<DT *>
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
<DT *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
<DT *>
(m->qk_prods); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
<DT *>
(m->qk_prods); + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + entries_above_diagonal, + static_cast
<DT>
(-INFINITY)); + } + + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the HIPDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
<DT *>
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
<DT *>
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + hipMemcpyDeviceToDevice, + stream)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast
<DT *>
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the genration tokens + DT *C = static_cast
<DT *>
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + tokens_previous_requests += num_new_tokens; + } + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); +} + // gridDim = num_heads // blockDim = num_tokens/num_request * head_size // QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| @@ -334,63 +668,6 @@ __global__ void apply_position_bias_qkprd(DT *input_ptr, } } -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = qkv_weight_size + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int global_num_q_heads, - int num_q_heads, - bool scaling_query, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - // int qkv_index = i / (num_tokens * qProjSize) % 3; - - int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - - int qkv_index = in_token_idx / hidden_size; - - int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - - int head_idx = - (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; - int global_head_idx = head_idx + shard_id * num_q_heads; - - size_t pre_length = - qkv_index == 0 - ? 0 - : (qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - - size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; - - input_ptr[i] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[i] *= scaling_factor; - } - } -} - template __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, @@ -405,60 +682,17 @@ __global__ void scaling_query_kernel(DT *input_ptr, } } -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - hipFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - hipFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = hipCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - template __global__ void apply_rotary_embedding_hf(DT *input_ptr, hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, int qProjSize, int kProjSize, int num_tokens, @@ -493,7 +727,29 @@ __global__ void // float before_real = complex_input[i].x, before_complex = int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i + + if (llama3_rope) { + float pi = HIP_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = hipCmulf(complex_input[i], complex_pos); @@ -507,6 +763,12 @@ __global__ void apply_rotary_embedding_bwd(DT *input_ptr, hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, int proj_size, int num_tokens, int hidden_size) { @@ -533,7 +795,28 @@ __global__ void size_t pos = tokenInfos[token_idx].abs_depth_in_request; - float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size)); // θ_i + + if (llama3_rope) { + float pi = HIP_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = hipCmulf(complex_input[i], complex_pos); @@ -542,172 +825,59 @@ __global__ void } } -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - 
size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); assert(m->qSize == m->vSize && m->qSize == m->kSize); - hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - hipblasDatatype_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - - // Step 1: Compute QKV projections - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_infr_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: QKV weights - // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] - // matrix B: input - // matrix B's layout: [qSize (hidden_dim), num_new_tokens] - // matrix C: devQKVProjArray - // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // Step 2: apply bias for QKV, or scale the query - if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { + if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream, output_ptr, + m->qProjSize, num_tokens, m->num_q_heads, - m->qProjSize, m->scaling_factor, m->hidden_size); } // Step 3: apply rotary embedding if needed - if (*m->apply_rotary_embedding) { + if (m->rotary_embedding_meta->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); - } -} - -template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int num_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + hipLaunchKernelGGL( + HIP_KERNEL_NAME(apply_rotary_embedding_hf), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); } } @@ -723,91 +893,13 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream, - static_cast
<DT *>
<DT *>(m->devQKVProjArray), - static_cast
<DT *>(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - m->hidden_size); - } -} - -template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - hipStream_t stream) { - hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - hipblasDatatype_t compute_type = HIPBLAS_R_16F; -#else - hipblasDatatype_t compute_type = cublas_data_type; -#endif - // Project to output, save result directly on output tensor - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: attn heads - // matrix B's layout: [vProjSize * num_heads, num_new_tokens] - DT const *B = static_cast
<DT *>(m->attn_heads); - // matrix B: output - // matrix B's layout: [oProjSize, num_new_tokens] - DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - // Add final output bias - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, + static_cast
<DT *>(m->devQKVProjArray), + static_cast
<DT *>(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, num_tokens, - qkv_weight_size, - m->oProjSize); + BatchConfig::max_sequence_length(), + m->hidden_size); } } @@ -856,93 +948,43 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, } } -template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream) { - // additional processing for weight uploading - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - if (m->quantization_type != DT_NONE) { - // copy weight_ptr to quantized_weight_ptr, do compression and store in - // m->weight_ptr - checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - m->quantized_weightSize, - hipMemcpyHostToDevice, - stream)); - - if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int4_attention_weights), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } else { - assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int8_attention_weights), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } - } else { - if (data_type == DT_FLOAT) { - checkCUDA(hipMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - hipMemcpyHostToDevice, - stream)); - } else if (data_type == DT_HALF) { - checkCUDA(hipMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - hipMemcpyHostToDevice, - stream)); - } else { - assert(false); - } +std::string get_fwd_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); } template void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { - if (m->offload && m->biasSize > 0) { - checkCUDA(hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); - bias_ptr = static_cast
(m->bias_ptr); - } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); + + // phase 1: Implement kernel to apply rotary embedding and scaling + compute_qkv_kernel( + m, bc, shard_id, static_cast
<DT *>(m->devQKVProjArray), stream); update_kv_cache_kernel<DT>
(m, bc, stream); if (bc->num_generation_tokens > 0) { @@ -953,14 +995,16 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, if (bc->num_tokens > bc->num_generation_tokens) { // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt( - m, bc, shard_id, bias_ptr, weight_ptr, stream); + compute_attention_kernel_prompt
(m, bc, shard_id, stream); } // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, @@ -978,14 +1022,75 @@ std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, return dst_filepath.string(); } +__global__ void transposeAdd_half_kernel( + half *out, half const *in, int width, int height, half alpha, half beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } +} + +__global__ void transposeAdd_float_kernel(float *out, + float const *in, + int width, + int height, + float alpha, + float beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } +} + +template +void transposeAdd(DT *out, + const DT *in, + int width, + int height, + float alpha, + float beta, + hipStream_t stream) { + assert(false && "Unsupported data type"); +} + +template <> +void transposeAdd(float *out, + float const *in, + int width, + int height, + float alpha, + float beta, + hipStream_t stream) { + transposeAdd_float_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, alpha, beta); +} + +template <> +void transposeAdd(half *out, + half const *in, + int width, + int height, + float alpha, + float beta, + hipStream_t stream) { + transposeAdd_half_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, __float2half(alpha), __float2half(beta)); +} + template void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, DT *input_grad_ptr, - DT const *weight_ptr, DT const *output_grad_ptr, - DT const *bias_ptr, hipStream_t stream) { assert(!m->offload); checkCUDA(hipblasSetStream(m->handle.blas, stream)); @@ -994,17 +1099,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -1026,47 +1120,18 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int vt_req_block_size = vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - // Step 1: compute gradients before final projection + // Step 1: copy 
gradient before final projection into workspace { int m_ = m->vProjSize * m->num_q_heads; int n_ = num_tokens; - int k_ = m->oProjSize; - int lda = m_; - int ldb = k_; - int ldc = m_; - float alpha = 1.0f, beta = 0.0f; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: output gradients - // matrix B's layout: [oProjSize, num_new_tokens] - DT const *B = - output_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; - // matrix C: attn_heads gradients - // matrix C's layout: [vProjSize * num_heads, num_new_tokens] DT *C = static_cast
(m->handle.workSpace); - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_N, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + hipMemcpyAsync(C, + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * + m->oProjSize, + m_ * n_ * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); if (m->inference_debugging) { // save result to file for checking std::string filename = @@ -1331,264 +1396,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; int ldc = num_tokens; - int strideA = num_tokens * num_tokens; - int strideB = m->qProjSize; - int strideC = num_tokens * m->qProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); - } - } - - // Step 7: perform rotary position embeddings (RoPE) bwd - { - if (*m->apply_rotary_embedding) { - assert(m->hidden_size == m->qProjSize * m->num_q_heads); - assert(m->qProjSize == m->kProjSize); - /*q&k*/ - int parallelism = num_tokens * m->hidden_size; - DT *A = static_cast
(m->devQKVProjArray); - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_bwd), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - A, - m->complex_input, - m->token_infos, - m->qProjSize, - num_tokens, - m->hidden_size); - DT *C = static_cast
(m->devQKVProjArray); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); - } - } - - // matrix C: gradients for key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = - static_cast
(m->devQKVProjArray) + - num_tokens * - (m->qProjSize * - m->num_q_heads); // skip over regions reserved for Q gradients - if (m->inference_debugging) { - std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; - save_tensor( - C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); - } - } - - // Step 8: compute gradients w.r.t. input - { - float alpha = 1.0f, beta = 0.0f; - if (!m->reset_input_grads[0]) { - beta = 1.0f; - } - // matrix A: QKV projection weights - // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] - DT const *A = weight_ptr; - // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] - DT const *B = static_cast
(m->devQKVProjArray); - // matrix C: gradients w.r.t. input - // matrix C's layout: [m->qSize, num_tokens] - DT *C = input_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; - int m_ = m->qSize; - int n_ = num_tokens; - int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int lda = m_; - int ldb = n_; - int ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; - save_tensor(C, num_tokens * m->qSize, filename.c_str()); - } - } - } -} - -} // namespace IncMultiHeadAttention -} // namespace Kernels - -using namespace Kernels::IncMultiHeadAttention; - -template -__global__ void store_query_cache(DT const *devQKVProjArray, - DT *qCache_ptr, - int num_tokens, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - - size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; - - DT qVal = devQKVProjArray[val_idx]; - - // query cache - qCache_ptr[i] = qVal; - } -} - -template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { - checkCUDA(hipblasSetStream(m->handle.blas, stream)); - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - hipblasDatatype_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || - (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; - // Copy query to m->query_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - if (activation_size_needed > 
m->allocated_peft_buffer_size1) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->query_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size1 = activation_size_needed; - } - int parallelism = m->hidden_size * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>
(m->query_activation_buffer), - num_tokens, - m->hidden_size); - } - // Step 1: compute query-key product QK.T/sqrt(d_k) - { - // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - // before transpositions - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - // N.B. strides are applied before transpose operations - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // matrix B: key cache - // matrix B's layout: [kProjSize * num_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, HIPBLAS_OP_N, + HIPBLAS_OP_T, m_, - n, - k, + n_, + k_, &alpha, A, cublas_data_type, @@ -1606,177 +1422,111 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, m->num_q_heads, compute_type, HIPBLAS_GEMM_DEFAULT)); - } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } } - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + // Step 7: perform rotary position embeddings (RoPE) bwd { - // Before modifying the parameters below, make sure to read the following - // description of the HIPDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - } - // Copy C_softmax to m->softmax_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - DT *C_softmax = static_cast
(m->qk_prods_softmax); - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; - if (activation_size_needed > m->allocated_peft_buffer_size2) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->softmax_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size2 = activation_size_needed; + if (m->rotary_embedding_meta->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(apply_rotary_embedding_bwd), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + A, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); } - checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, - C_softmax, - sizeof(DT) * total_tokens * num_new_tokens * - m->num_q_heads, - hipMemcpyDeviceToDevice, - stream)); } - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T + + // Step 8: compute gradients w.r.t. input { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = vt_block_size; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
(m->valueCache) + i * vt_req_block_size; - // matrix B: qk_prods_softmax - // matrix B's layout: [num_new_tokens, total_tokens, num_heads] - // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous - // requests (all heads) - DT *B = static_cast
<DT *>(m->qk_prods_softmax); - // matrix C: attn heads - // matrix C's layout: [vProjSize, num_heads, num_new_tokens] - // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous - // requests - // store the result attn heads, also skip the genration tokens - DT *C = static_cast<DT *>
(m->attn_heads) + - (bc->requestsInfo[i].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + // int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + + // The original version uses existing result and attention's projection to + // do further calculation in a way different than the usual dense layer, + // they are off by a transpose. So an explicit transpose is needed here. + // The add here is just for gradient accumulation. + transposeAdd(C, B, n_, k_, alpha, beta, stream); + + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } } - tokens_previous_requests += num_new_tokens; - } - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -1785,43 +1535,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1843,12 +1564,9 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorR const &output_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -1857,35 +1575,23 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - // assert(input.data_type == weight.data_type); assert(input_grad.data_type == output_grad.data_type); - if (use_bias) { - assert(input_grad.data_type == bias.data_type); - } if (input_grad.data_type == DT_HALF) { assert(!m->offload); - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_half_ptr(), - weight.get_half_ptr(), output_grad.get_half_ptr(), - bias_ptr, stream); } else if (input_grad.data_type == DT_FLOAT) { assert(!m->offload); - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_float_ptr(), - weight.get_float_ptr(), output_grad.get_float_ptr(), - bias_ptr, stream); } else { assert(false && "Unspported data type"); @@ -1904,7 +1610,6 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1919,14 +1624,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, @@ -1947,14 +1649,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, - bool _qkv_bias, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _final_bias, float _scaling_factor, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_q_heads, @@ -1963,7 +1662,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_kv_heads, DataType _quantization_type, bool _offload) - : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); @@ -1989,29 +1688,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( num_kv_heads = _num_kv_heads; hidden_size = num_q_heads * qProjSize; - weightSize = - ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)) * - num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * - size_of_dt; - if (quantization_type != DT_NONE) { - quantized_weightSize = get_quantization_to_byte_size( - attn->data_type, quantization_type, weightSize); - } - // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; - - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int final_bias_size = oProjSize; - biasSize = - (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); - - // has_load_weights = (bool *)calloc(1, sizeof(bool)); - //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; - qkv_bias = (bool *)calloc(1, sizeof(bool)); - *qkv_bias = _qkv_bias; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; scaling_query = (bool *)calloc(1, sizeof(bool)); *scaling_query = _scaling_query; scaling_factor = _scaling_factor; @@ -2019,14 +1698,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *qk_prod_scaling = _qk_prod_scaling; position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - final_bias = (bool *)calloc(1, sizeof(bool)); - *final_bias = _final_bias; - - // allocate weight and bias in the reserve space for cpu offloading - if (offload) { - weight_ptr = gpu_mem_allocator.allocate_reserved_untyped(weightSize); - bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); - } // allocate memory for the seqArray and reserve space { @@ -2092,9 +1763,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( ? key_cache_size + value_cache_size + qkv_max_proj_size : key_cache_size + value_cache_size); - if (quantization_type != DT_NONE) { - totalSharedSize += quantized_weightSize; - } assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); @@ -2125,29 +1793,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( handler.batch_config_metadata->requestsInfo); if (offload) { - // token_infos = - // gpu_mem_allocator.allocate_reserved( - // tokeninfo_size); - // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); - // offset += attn_heads_size * size_of_dt; complex_input = gpu_mem_allocator.allocate_reserved(complex_size); - // offset += complex_size * sizeof(hipFloatComplex); - // request_infos = - // gpu_mem_allocator.allocate_reserved( - // requestinfo_size); } else { - // token_infos = - // gpu_mem_allocator.allocate_instance( - // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -2156,16 +1810,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - // request_infos = - // gpu_mem_allocator.allocate_instance( - // requestinfo_size); } // allocate more size for quantization data if (quantization_type != DT_NONE) { assert(offload); - quantized_weight_ptr = - 
gpu_mem_allocator.allocate_reserved(quantized_weightSize); } if (!offload) { assert(gpu_mem_allocator.reserved_total_size == @@ -2183,49 +1832,32 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + hipStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, hipStream_t stream); -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - hipStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - half *output_ptr, - hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b278611b60..2802dd41b6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -19,6 +19,7 @@ #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { @@ -31,1075 +32,162 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { -// gridDim = num_heads -// blockDim = num_tokens/num_request * head_size -// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| -// one thread process one head_size -template -__global__ void compute_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - // eg. if head_size = 128, thread_per_key = 4, with float32 precision - // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 - // K_ELTS_PER_THREAD = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 / 1 = 32 - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); - // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - // shared memory objects - extern __shared__ char smem_[]; + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} - float qk_max = -FLT_MAX; +template +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; - const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - // DT const *q_ptr = - // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + DT qVal = devQKVProjArray[val_idx]; - // q tensor in this thread - // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total - // K_VECS_PER_THREAD elements - // QK_vec_k: 32->1, 64->2, 128->4... head_size - // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + // query cache + qCache_ptr[i] = qVal; + } +} - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... 
- int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; } - __syncthreads(); - // first iter = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 - // K_PER_ITER how many keys in this loop - // The number of timesteps loaded per iteration. - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; +} - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; +template +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + cudaDataType_t compute_type = cublas_data_type; - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - // get k, perform qk proj + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast(k_cache_batch + - ti_circ * hidden_size + - head_idx * per_head_size + jj); - } - // Compute dot product. - // This includes a reduction across the threads in the same thread group. + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { + continue; } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - // // todo add positional embedding to the qk production - // // Store the product to shared memory. There's one qk value per - // timestep. - // // Update the max. 
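// --- Illustrative aside (not part of this patch): the qk_max / exp_sum
// reductions in this kernel implement a numerically stable softmax over the
// per-timestep attention logits. A scalar CPU equivalent, shown only as a
// reference for what the warp/block reductions compute:
#include <cmath>
#include <vector>

void stable_softmax(std::vector<float> &logits) {
  float max_logit = -INFINITY;
  for (float v : logits) {
    max_logit = fmaxf(max_logit, v); // subtract the max so expf cannot overflow
  }
  float sum = 0.f;
  for (float &v : logits) {
    v = expf(v - max_logit);
    sum += v;
  }
  float inv_sum = 1.f / (sum + 1.e-6f); // same epsilon as the kernel's __fdividef
  for (float &v : logits) {
    v *= inv_sum;
  }
}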
- if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. 
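// --- Illustrative aside (not part of this patch): the strided-batched GEMM and
// causal masking set up in compute_attention_kernel_prompt compute, per head,
// scores[i][j] = (q_i . k_j) / sqrt(d_k) with future key positions masked out.
// A scalar reference under that assumption (names are illustrative only):
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<std::vector<float>> scaled_causal_qk_scores(
    std::vector<std::vector<float>> const &Q, // [num_new_tokens][d_k]
    std::vector<std::vector<float>> const &K, // [total_tokens][d_k]
    std::size_t prompt_offset) {              // total_tokens - num_new_tokens
  std::size_t d_k = Q.empty() ? 0 : Q[0].size();
  std::vector<std::vector<float>> S(Q.size(),
                                    std::vector<float>(K.size(), 0.f));
  for (std::size_t i = 0; i < Q.size(); i++) {
    for (std::size_t j = 0; j < K.size(); j++) {
      if (j > i + prompt_offset) {
        S[i][j] = -INFINITY; // causal mask: token i cannot attend future keys
        continue;
      }
      float dot = 0.f;
      for (std::size_t d = 0; d < d_k; d++) {
        dot += Q[i][d] * K[j][d];
      }
      S[i][j] = dot / sqrtf((float)d_k);
    }
  }
  return S;
}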
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + request_idx * hidden_size + - head_idx * per_head_size + vi), - out); - } -} - -// only used by MPT model. https://arxiv.org/abs/2108.12409 -template -__global__ void apply_position_bias_qkprd(DT *input_ptr, - int num_tokens, - int num_total_tokens, - int num_heads, - int global_num_q_heads, - int shard_id) { - CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { - // get head_idx, - int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); - int position_idx = (i / num_tokens) % num_total_tokens; - position_idx = position_idx + 1 - num_total_tokens; - // 8 is alibi_bias_max in - // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json - float base = (float)(head_idx + 1) * 8 / global_num_q_heads; - float slopes = 1.0 / pow(2, base); - // if(i == 0){ - // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, - // position_idx * slopes); - // } - input_ptr[i] += static_cast
(position_idx * slopes); - } -} - -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = qkv_weight_size + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int global_num_q_heads, - int num_q_heads, - bool scaling_query, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - // int qkv_index = i / (num_tokens * qProjSize) % 3; - - int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - - int qkv_index = in_token_idx / hidden_size; - - int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - - int head_idx = - (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; - int global_head_idx = head_idx + shard_id * num_q_heads; - - size_t pre_length = - qkv_index == 0 - ? 0 - : (qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - - size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; - - input_ptr[i] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[i] *= scaling_factor; - } - } -} - -template -__global__ void scaling_query_kernel(DT *input_ptr, - int qProjSize, - int num_tokens, - int num_q_heads, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= - scaling_factor; - } -} - -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -__global__ void - apply_rotary_embedding_hf(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_tokens, - size_t q_array_size, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int token_idx = real_i / (hidden_size / 2); - int idx = real_i % (proj_size / 2); - int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); - - int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * QKV_WEIGHT_NUM + - hidden_size * (q_tensor ? 0 : 1); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -__global__ void - apply_rotary_embedding_bwd(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int proj_size, - int num_tokens, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - // compute indexes to visit first half proj_size of each of q/k tensor. - // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd - bool q_tensor = i < (num_tokens * hidden_size / 2); - int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; - assert(hidden_size % proj_size == 0); - int num_heads = hidden_size / proj_size; - - int token_idx = real_i % num_tokens; - int idx = (real_i / num_tokens) % (proj_size / 2); - int head_idx = real_i / (num_tokens * proj_size / 2); - assert(head_idx < num_heads); - - int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + - head_idx * num_tokens * proj_size + - idx * num_tokens + token_idx; - int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - -template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - - // Step 1: Compute QKV projections - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_infr_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: QKV weights - // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] - // matrix B: input - // matrix B's layout: [qSize (hidden_dim), num_new_tokens] - // matrix C: devQKVProjArray - // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - - int num_tokens = bc->num_active_tokens(); - int parallelism = m->kProjSize * num_tokens * m->num_q_heads; - size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - - // Step 2: apply bias for QKV, or scale the query - if (*m->qkv_bias) { - apply_proj_bias_qkv<<>>(output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - 
m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { - scaling_query_kernel<<>>(output_ptr, - num_tokens, - m->num_q_heads, - m->qProjSize, - m->scaling_factor, - m->hidden_size); - } - - // Step 3: apply rotary embedding if needed - if (*m->apply_rotary_embedding) { - /*q&k*/ - parallelism = num_tokens * m->hidden_size; - apply_rotary_embedding_hf<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); - } -} - -template -void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_infr_tokens(); - if (num_tokens > 0) { - int parallelism = m->hidden_size * num_tokens; - store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray),
-                               static_cast<DT *>(m->keyCache),
-                               static_cast<DT *>
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - m->hidden_size); - } -} - -template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - cudaStream_t stream) { - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; -#endif - // Project to output, save result directly on output tensor - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: attn heads - // matrix B's layout: [vProjSize * num_heads, num_new_tokens] - DT const *B = static_cast
<DT *>(m->attn_heads);
-    // matrix C: output
-    // matrix C's layout: [oProjSize, num_new_tokens]
-    DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - // Add final output bias - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } -} - -#define LAUNCH_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_attention_kernel_generation_kernel \ - <<>>( \ - static_cast
<DT *>(m->devQKVProjArray),                              \
-          static_cast<DT *>(m->keyCache),                                     \
-          static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos) - -template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - dim3 grid(m->num_q_heads, bc->num_generation_tokens); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); - } -} - -template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream) { - // additional processing for weight uploading - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - if (m->quantization_type != DT_NONE) { - // copy weight_ptr to quantized_weight_ptr, do compression and store in - // m->weight_ptr - cudaMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - m->quantized_weightSize, - cudaMemcpyHostToDevice, - stream); - - if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } else { - assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } - } else { - if (data_type == DT_FLOAT) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else if (data_type == DT_HALF) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else { - assert(false); - } - } -} - -template -void inference_kernel(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - - if (m->offload && m->biasSize > 0) { - cudaMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); - bias_ptr = static_cast
(m->bias_ptr); - } - - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
-  update_kv_cache_kernel<DT>(m, bc, stream);
-
-  if (bc->num_generation_tokens > 0) {
-    // phase 3: Compute attention score for generation tokens
-    compute_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>
(m->attn_heads), stream); - } - - if (bc->num_tokens > bc->num_generation_tokens) { - // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt( - m, bc, shard_id, bias_ptr, weight_ptr, stream); - } - - // compute output production and bias together for all tokens - int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); -} - -std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, - int shard_id) { - std::string op_name_without_uid = - IncMultiHeadSelfAttention::get_op_name_without_uid(m); - fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); - if (m->layer_guid.model_id > 0) { - assert(false && "Model ID > 0 not supported yet"); - } - std::string layername = "layers." + - std::to_string(m->layer_guid.transformer_layer_id) + - "." + op_name_without_uid; - dst_filepath /= layername; - return dst_filepath.string(); -} - -template -void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *input_grad_ptr, - DT const *weight_ptr, - DT const *output_grad_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - assert(!m->offload); - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - if (!bc->requestsInfo[i].peft_bwd) { - continue; - } - int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - // Currently assume we are calculating gradients for all tokens - // of a request - assert(num_tokens == num_total_tokens); - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - // Step 1: compute gradients before final projection - { - int m_ = m->vProjSize * m->num_q_heads; - int n_ = num_tokens; - int k_ = m->oProjSize; - int lda = m_; - int ldb = k_; - int ldc = m_; - float alpha = 1.0f, beta = 0.0f; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: output gradients - // matrix B's layout: [oProjSize, num_new_tokens] - DT const *B = - output_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; - // matrix 
C: attn_heads gradients - // matrix C's layout: [vProjSize * num_heads, num_new_tokens] - DT *C = static_cast
(m->handle.workSpace); - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - // save result to file for checking - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; - save_tensor(C, m_ * n_, filename.c_str()); - } - } - // Step 2: compute gradients w.r.t. value - { - float alpha = 1.0f, beta = 0.0f; - // matrix A: qk_prods_softmax - // matrix A's layout: [num_new_tokens, total_tokens, num_heads] - DT const *A = static_cast
<DT *>(m->qk_prods_softmax);
-      // matrix B: attn_heads gradients
-      // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
-      DT const *B = static_cast<DT *>(m->handle.workSpace);
-      // matrix C: gradients for value (saved as part of m->devQKVProjArray)
-      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
-      DT *C = static_cast<DT *>
(m->devQKVProjArray) + - 2 * num_tokens * - (m->qProjSize * m->num_q_heads); // skip over regions reserved - // for Q and K gradients - // after transpositions - int m_ = num_tokens; // total_tokens - int n_ = m->vProjSize; // num_new_tokens - int k_ = num_tokens; // num_new_tokens - // before transpositions - int lda = num_tokens; // num_new_tokens - int ldb = m->vProjSize * m->num_q_heads; - int ldc = num_tokens; // total_tokens - // N.B. strides are applied before transpose operations - int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens - int strideB = m->vProjSize; - int strideC = num_tokens * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; - save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); - std::string filename2 = - get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; - save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); - } - } - // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor - { - float alpha = 1.0f, beta = 0.0f; - // matrix A: attn_heads gradients - // matrix A's layout: [vProjSize * num_heads, num_new_tokens] - DT const *A = static_cast
<DT *>(m->handle.workSpace);
-      // matrix B: value cache
-      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
-      DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; - // matrix C: qk_prods_softmax gradients + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
<DT *>(m->devQKVProjArray) +
+                    bc->requestsInfo[i].first_token_offset_in_batch *
+                        m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM;
+      // matrix B: key cache
+      // matrix B's layout: [kProjSize * num_heads, total_tokens]
+      // To get B, skip over K entries from previous requests (all heads +
+      // padding)
+      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
+      // matrix C: qk_prods
       // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
-      DT *C = static_cast<DT *>
(m->qk_prods_softmax); - // after transposition & striding - int m_ = num_tokens; // num_new_tokens - int n_ = num_tokens; - int k_ = m->vProjSize; - // before transposition and striding - int lda = m->vProjSize * m->num_q_heads; - int ldb = m->vProjSize * m->num_q_heads; - int ldc = num_tokens; // num_new_tokens - int strideA = m->vProjSize; - int strideB = m->vProjSize; - int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens - + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
(m->qk_prods); checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, m_, - n_, - k_, + n, + k, &alpha, A, cublas_data_type, @@ -1117,23 +205,57 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; - save_tensor( - C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); - std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; - save_tensor( - B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); - } } - // Step 4: softmax backpropagation + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
(m->qk_prods); + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + entries_above_diagonal, + static_cast
(-INFINITY)); + } + + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) { - float alpha = 1.0f, beta = 0.0f; + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. int n_param = m->num_q_heads; - int c_param = num_tokens; + int c_param = total_tokens; int h_param = 1; - int w_param = num_tokens; + int w_param = num_new_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, @@ -1141,85 +263,79 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->qk_tensor, - m->softmax_activation_buffer, - m->qk_tensor, - m->qk_prods_softmax, - &beta, - m->qk_tensor, - m->qk_prods)); - - if (m->inference_debugging) { - DT *C = static_cast
(m->qk_prods); - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; - save_tensor( - C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); - } - - // TODO: fill all elements above diagonal to force causal attention - size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(static_cast
<DT *>(m->qk_prods),
-                                     num_tokens,
-                                     num_tokens,
-                                     m->num_q_heads,
-                                     entries_above_diagonal,
-                                     DT(0.0f));
-    }
-    if (m->inference_debugging) {
-      DT *C = static_cast<DT *>
(m->qk_prods); - std::string filename = get_peft_dbg_folder(m, shard_id) + - ".qk_prods.softmax_grad_in.masked"; - save_tensor( - C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; } + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); } - // Step 5: compute gradients w.r.t. key + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T { - float alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = 1.0f / sqrt(m->kProjSize); - } - // matrix A: gradients w.r.t. qk_prods - // matrix A's layout: [num_new_tokens, num_tokens, num_heads] - DT const *A = static_cast
<DT *>(m->qk_prods);
-      // matrix B: query activation (in query_activation_buffer)
-      // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens]
-      DT const *B = static_cast<DT *>(m->query_activation_buffer);
-      // matrix C: gradients for key (saved as part of m->devQKVProjArray)
-      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
-      DT *C =
-          static_cast<DT *>
(m->devQKVProjArray) + - num_tokens * - (m->qProjSize * - m->num_q_heads); // skip over regions reserved for Q gradients - // after transposition & striding - int m_ = num_tokens; - int n_ = m->kProjSize; - int k_ = num_tokens; // num_new_tokens - // before transposition and striding - int lda = num_tokens; // num_new_tokens - int ldb = m->kProjSize * m->num_q_heads; - int ldc = num_tokens; - int strideA = num_tokens * num_tokens; - int strideB = m->kProjSize; - int strideC = num_tokens * m->kProjSize; + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
+      // matrix B: qk_prods_softmax
+      // matrix B's layout: [num_new_tokens, total_tokens, num_heads]
+      // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous
+      // requests (all heads)
+      DT *B = static_cast<DT *>(m->qk_prods_softmax);
+      // matrix C: attn heads
+      // matrix C's layout: [vProjSize, num_heads, num_new_tokens]
+      // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous
+      // requests
+      // store the result attn heads, also skip the generation tokens
+      DT *C = static_cast<DT *>
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_T, m_, - n_, - k_, + n, + k, &alpha, A, cublas_data_type, @@ -1237,323 +353,797 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".query_activation"; - save_tensor( - B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); - std::string filename2 = - get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; - save_tensor( - C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + tokens_previous_requests += num_new_tokens; + } + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); +} + +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
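+      // Descriptive note (matching the example sizes given above, e.g.
+      // head_size = 128, THREADS_PER_KEY = 4, float keys so K_VEC_SIZE = 1
+      // and K_VECS_PER_THREAD = 32): each of the THREADS_PER_KEY cooperating
+      // threads has loaded a strided slice of this timestep's key, laid out
+      // the same way q_vecs was filled, so Qk_dot below combines the partial
+      // per-thread products into one q.k score per timestep.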
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); } - // Step 6: compute gradients w.r.t query - { - float alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = 1.0f / sqrt(m->kProjSize); + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. 
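+      // Each round, value groups at or above the midpoint stage their partial
+      // output vectors in out_smem and the groups below the midpoint add them
+      // in, so after log2(V_PER_ITER) halvings the vo == 0 group holds the
+      // complete softmax-weighted sum of the loaded values for this head.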
+ int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; } - // matrix A: gradients w.r.t. qk_prods - // matrix A's layout: [num_new_tokens, num_tokens, num_heads] - DT const *A = static_cast
<DT *>(m->qk_prods);
-      // matrix B: key cache
-      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
-      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
-      // matrix C: gradients for query (saved as part of m->devQKVProjArray)
-      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
-      DT *C = static_cast<DT *>
(m->devQKVProjArray); - // after transposition & striding - int m_ = num_tokens; // num_new_tokens - int n_ = m->qProjSize; - int k_ = num_tokens; - // before transposition and striding - int lda = num_tokens; // num_new_tokens - int ldb = m->qProjSize * m->num_q_heads; - int ldc = num_tokens; - int strideA = num_tokens * num_tokens; - int strideB = m->qProjSize; - int strideC = num_tokens * m->qProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); } + __syncthreads(); } + } - // Step 7: perform rotary position embeddings (RoPE) bwd - { - if (*m->apply_rotary_embedding) { - assert(m->hidden_size == m->qProjSize * m->num_q_heads); - assert(m->qProjSize == m->kProjSize); - /*q&k*/ - int parallelism = num_tokens * m->hidden_size; - DT *A = static_cast
(m->devQKVProjArray); - apply_rotary_embedding_bwd<<>>(A, - m->complex_input, - m->token_infos, - m->qProjSize, - num_tokens, - m->hidden_size); - DT *C = static_cast
(m->devQKVProjArray); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); - } - } + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} - // matrix C: gradients for key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = - static_cast
(m->devQKVProjArray) + - num_tokens * - (m->qProjSize * - m->num_q_heads); // skip over regions reserved for Q gradients - if (m->inference_debugging) { - std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; - save_tensor( - C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); +// only used by MPT model. https://arxiv.org/abs/2108.12409 +template +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id) { + CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { + // get head_idx, + int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); + int position_idx = (i / num_tokens) % num_total_tokens; + position_idx = position_idx + 1 - num_total_tokens; + // 8 is alibi_bias_max in + // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json + float base = (float)(head_idx + 1) * 8 / global_num_q_heads; + float slopes = 1.0 / pow(2, base); + // if(i == 0){ + // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, + // position_idx * slopes); + // } + input_ptr[i] += static_cast
(position_idx * slopes); + } +} + +template +__global__ void scaling_query_kernel(DT *input_ptr, + int qProjSize, + int num_tokens, + int num_q_heads, + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= + scaling_factor; + } +} + +template +__global__ void + apply_rotary_embedding_hf(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + int qProjSize, + int kProjSize, + int num_tokens, + size_t q_array_size, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int token_idx = real_i / (hidden_size / 2); + int idx = real_i % (proj_size / 2); + int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); + + int real_part_index = idx + head_idx * proj_size + + token_idx * hidden_size * QKV_WEIGHT_NUM + + hidden_size * (q_tensor ? 0 : 1); + int complex_part_index = real_part_index + (proj_size / 2); + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + int pos_i = real_i % (proj_size / 2); + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); } } - // Step 8: compute gradients w.r.t. input - { - float alpha = 1.0f, beta = 0.0f; - if (!m->reset_input_grads[0]) { - beta = 1.0f; - } - // matrix A: QKV projection weights - // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] - DT const *A = weight_ptr; - // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] - DT const *B = static_cast
(m->devQKVProjArray); - // matrix C: gradients w.r.t. input - // matrix C's layout: [m->qSize, num_tokens] - DT *C = input_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; - int m_ = m->qSize; - int n_ = num_tokens; - int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int lda = m_; - int ldb = n_; - int ldc = m_; - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; - save_tensor(C, num_tokens * m->qSize, filename.c_str()); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size)); // θ_i + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); } } + + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; } } -} // namespace IncMultiHeadAttention -} // namespace Kernels +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + cudaStream_t stream) { -using namespace Kernels::IncMultiHeadAttention; + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + assert(m->qSize == m->vSize && m->qSize == m->kSize); + + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_q_heads; + size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + + if (m->scaling_query) { + scaling_query_kernel<<>>(output_ptr, + m->qProjSize, + num_tokens, + m->num_q_heads, + m->scaling_factor, + m->hidden_size); + } + + // Step 3: apply rotary embedding if needed + if (m->rotary_embedding_meta->apply_rotary_embedding) { + /*q&k*/ + parallelism = num_tokens * m->hidden_size; + apply_rotary_embedding_hf<<>>( + output_ptr, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); + } +} template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int num_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; +void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_infr_tokens(); + if (num_tokens > 0) { + int parallelism = m->hidden_size * num_tokens; + store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray),
+                                static_cast<DT *>(m->keyCache),
+                                static_cast<DT *>
(m->valueCache), + m->token_infos, + num_tokens, + BatchConfig::max_sequence_length(), + m->hidden_size); + } +} - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray),                              \
+          static_cast<DT *>(m->keyCache),                                     \
+          static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos) - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_generation_tokens); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; +std::string get_fwd_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); } template -__global__ void store_query_cache(DT const *devQKVProjArray, - DT *qCache_ptr, - int num_tokens, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT const *qkv_ptr, + DT *output_ptr, + cudaStream_t stream) { - size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - DT qVal = devQKVProjArray[val_idx]; + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); - // query cache - qCache_ptr[i] = qVal; + // phase 1: Implement kernel to apply rotary embedding and scaling + compute_qkv_kernel( + m, bc, shard_id, static_cast
<DT *>(m->devQKVProjArray), stream);
+  update_kv_cache_kernel<DT>(m, bc, stream);
+
+  if (bc->num_generation_tokens > 0) {
+    // phase 3: Compute attention score for generation tokens
+    compute_attention_kernel_generation<DT>(
+        m, bc, static_cast<DT *>(m->attn_heads), stream);
+  }
+
+  if (bc->num_tokens > bc->num_generation_tokens) {
+    // phase 4: Compute attention score for prompt tokens;
+    compute_attention_kernel_prompt<DT>
(m, bc, shard_id, stream); + } + + int num_tokens = bc->num_active_tokens(); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); +} + +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +__global__ void transposeAdd_half_kernel( + half *out, half const *in, int width, int height, half alpha, half beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } +} + +__global__ void transposeAdd_float_kernel(float *out, + float const *in, + int width, + int height, + float alpha, + float beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; } } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void transposeAdd(DT *out, + const DT *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + assert(false && "Unsupported data type"); +} + +template <> +void transposeAdd(float *out, + float const *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + transposeAdd_float_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, alpha, beta); +} + +template <> +void transposeAdd(half *out, + half const *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + transposeAdd_half_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, __float2half(alpha), __float2half(beta)); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *output_grad_ptr, + cudaStream_t stream) { + assert(!m->offload); checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = 
bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || - (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; - // Copy query to m->query_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - if (activation_size_needed > m->allocated_peft_buffer_size1) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->query_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size1 = activation_size_needed; - } - int parallelism = m->hidden_size * num_tokens; - store_query_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray),
-          static_cast<DT *>
(m->query_activation_buffer), - num_tokens, - m->hidden_size); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; } - // Step 1: compute query-key product QK.T/sqrt(d_k) + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: copy gradient before final projection into workspace { - // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + DT *C = static_cast
(m->handle.workSpace); + cudaMemcpyAsync(C, + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * + m->oProjSize, + m_ * n_ * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods_softmax);
+      // matrix B: attn_heads gradients
+      // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
+      DT const *B = static_cast<DT *>
(m->handle.workSpace);
+      // matrix C: gradients for value (saved as part of m->devQKVProjArray)
+      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+      DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens // before transpositions - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens // N.B. strides are applied before transpose operations - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // matrix B: key cache - // matrix B's layout: [kProjSize * num_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, - CUBLAS_OP_N, + CUBLAS_OP_T, m_, - n, - k, + n_, + k_, &alpha, A, cublas_data_type, @@ -1571,57 +1161,80 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
<DT *>(m->handle.workSpace);
+      // matrix B: value cache
+      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
+      DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size;
+      // matrix C: qk_prods_softmax gradients
+      // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
+      DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } } - - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + // Step 4: softmax backpropagation { - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. + float alpha = 1.0f, beta = 0.0f; int n_param = m->num_q_heads; - int c_param = total_tokens; + int c_param = num_tokens; int h_param = 1; - int w_param = num_new_tokens; + int w_param = num_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, @@ -1629,79 +1242,145 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } } - // Copy C_softmax to m->softmax_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - DT *C_softmax = static_cast
(m->qk_prods_softmax); - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; - if (activation_size_needed > m->allocated_peft_buffer_size2) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->softmax_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size2 = activation_size_needed; + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods);
+      // matrix B: query activation (in query_activation_buffer)
+      // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens]
+      DT const *B = static_cast<DT *>
(m->query_activation_buffer);
+      // matrix C: gradients for key (saved as part of m->devQKVProjArray)
+      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+      DT *C =
+          static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); } - checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, - C_softmax, - sizeof(DT) * total_tokens * num_new_tokens * - m->num_q_heads, - cudaMemcpyDeviceToDevice, - stream)); } - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T + // Step 6: compute gradients w.r.t query { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = vt_block_size; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
-      // matrix B: qk_prods_softmax
-      // matrix B's layout: [num_new_tokens, total_tokens, num_heads]
-      // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous
-      // requests (all heads)
-      DT *B = static_cast<DT *>
(m->qk_prods_softmax);
-      // matrix C: attn heads
-      // matrix C's layout: [vProjSize, num_heads, num_new_tokens]
-      // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous
-      // requests
-      // store the result attn heads, also skip the genration tokens
-      DT *C = static_cast<DT *>
(m->attn_heads) + - (bc->requestsInfo[i].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods);
+      // matrix B: key cache
+      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
+      DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size;
+      // matrix C: gradients for query (saved as part of m->devQKVProjArray)
+      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+      DT *C = static_cast<DT *>
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, m_, - n, - k, + n_, + k_, &alpha, A, cublas_data_type, @@ -1719,30 +1398,109 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (m->rotary_embedding_meta->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>( + A, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + // int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + + // The original version uses existing result and attention's projection to + // do further calculation in a way different than the usual dense layer, + // they are off by a transpose. So an explicit transpose is needed here. + // The add here is just for gradient accumulation. + transposeAdd(C, B, n_, k_, alpha, beta, stream); + + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } } - tokens_previous_requests += num_new_tokens; - } - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -1751,43 +1509,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1809,12 +1538,9 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -1823,35 +1549,23 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( cudaEventRecord(t_start, stream); } - // assert(input.data_type == weight.data_type); assert(input_grad.data_type == output_grad.data_type); - if (use_bias) { - assert(input_grad.data_type == bias.data_type); - } if (input_grad.data_type == DT_HALF) { assert(!m->offload); - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_half_ptr(), - weight.get_half_ptr(), output_grad.get_half_ptr(), - bias_ptr, stream); } else if (input_grad.data_type == DT_FLOAT) { assert(!m->offload); - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_float_ptr(), - weight.get_float_ptr(), output_grad.get_float_ptr(), - bias_ptr, stream); } else { assert(false && "Unspported data type"); @@ -1870,7 +1584,6 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1885,14 +1598,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, @@ -1913,14 +1623,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, - bool _qkv_bias, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _final_bias, float _scaling_factor, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_q_heads, @@ -1929,7 +1636,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_kv_heads, DataType _quantization_type, bool _offload) - : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -1955,29 +1662,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( num_kv_heads = _num_kv_heads; hidden_size = num_q_heads * qProjSize; - weightSize = - ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)) * - num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * - size_of_dt; - if (quantization_type != DT_NONE) { - quantized_weightSize = get_quantization_to_byte_size( - attn->data_type, quantization_type, weightSize); - } - // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; - - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int final_bias_size = oProjSize; - biasSize = - (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); - - // has_load_weights = (bool *)calloc(1, sizeof(bool)); - //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; - qkv_bias = (bool *)calloc(1, sizeof(bool)); - *qkv_bias = _qkv_bias; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; scaling_query = (bool *)calloc(1, sizeof(bool)); *scaling_query = _scaling_query; scaling_factor = _scaling_factor; @@ -1985,14 +1672,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *qk_prod_scaling = _qk_prod_scaling; position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - final_bias = (bool *)calloc(1, sizeof(bool)); - *final_bias = _final_bias; - - // allocate weight and bias in the reserve space for cpu offloading - if (offload) { - weight_ptr = gpu_mem_allocator.allocate_reserved_untyped(weightSize); - bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); - } // allocate memory for the seqArray and reserve space { @@ -2058,9 +1737,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( ? key_cache_size + value_cache_size + qkv_max_proj_size : key_cache_size + value_cache_size); - if (quantization_type != DT_NONE) { - totalSharedSize += quantized_weightSize; - } assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); @@ -2091,29 +1767,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( handler.batch_config_metadata->requestsInfo); if (offload) { - // token_infos = - // gpu_mem_allocator.allocate_reserved( - // tokeninfo_size); - // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); - // offset += attn_heads_size * size_of_dt; complex_input = gpu_mem_allocator.allocate_reserved(complex_size); - // offset += complex_size * sizeof(cuFloatComplex); - // request_infos = - // gpu_mem_allocator.allocate_reserved( - // requestinfo_size); } else { - // token_infos = - // gpu_mem_allocator.allocate_instance( - // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -2122,16 +1784,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - // request_infos = - // gpu_mem_allocator.allocate_instance( - // requestinfo_size); } // allocate more size for quantization data if (quantization_type != DT_NONE) { assert(offload); - quantized_weight_ptr = - 
gpu_mem_allocator.allocate_reserved(quantized_weightSize); } if (!offload) { assert(gpu_mem_allocator.reserved_total_size == @@ -2149,49 +1806,32 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, cudaStream_t stream); -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - cudaStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - half *output_ptr, - cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index d4f930db6c..3835d258e0 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -511,6 +511,7 @@ void forward_kernel(LinearMeta const *m, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // use_bias = True if (bias_ptr != NULL) { // fuse bias and relu diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 20ad762b62..09170d3c28 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -668,11 +668,11 @@ void Linear::inference_task(Task const *task, } Linear::save_inference_tensors_to_file( m, shard_id, bc, {input}, weights_accessors, {output}); - printf("\tin=[%i,%i].T @ w=[%i,%i] -> out=[%i,%i]\n", - in_dim, - bc->num_tokens, + printf("\tw=[%i,%i].T @ in=[%i,%i] -> out=[%i,%i]\n", in_dim, out_dim, + in_dim, + bc->num_tokens, out_dim, bc->num_tokens); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 2a30d12d6d..ce4150f9d6 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -988,9 +988,20 @@ void ResidualLayerNorm::inference_task( return; } - assert(regions.size() == - 3 + m->use_two_residuals + - (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); + int expected_num_regions = 4; // input, residual1, added_output, output + if (m->use_two_residuals) { + expected_num_regions++; // residual2 + } + if (m->inplace_residual) { + expected_num_regions--; // added_output = input + } + if (m->elementwise_affine) { + expected_num_regions += 1; // gamma + if (m->use_bias) { + expected_num_regions += 1; // beta + } + } + assert(regions.size() == expected_num_regions); int region_idx = 0, task_region_idx = 0; GenericTensorAccessorR input = diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 52da51fb26..aa74ecc6f5 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -52,24 +52,22 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor - FFModel::spec_inc_multihead_self_attention(Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multihead_self_attention( + Tensor const input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { return spec_inc_multiquery_self_attention(input, embed_dim, num_heads, @@ -77,12 +75,10 @@ Tensor kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -90,30 +86,27 @@ Tensor name); } -Tensor - FFModel::spec_inc_multiquery_self_attention(Tensor const input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 
2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -121,7 +114,7 @@ Tensor data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0 /*weights*/, 1 /*outputs*/, casted_input); } else { @@ -130,7 +123,7 @@ Tensor data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0 /*weights*/, 1 /*outputs*/, input); } @@ -144,51 +137,26 @@ Tensor li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - { - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } + li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); @@ -216,14 +184,20 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + 
layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -242,15 +216,12 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, - false /*allocate_weights*/, layer->name); } @@ -264,29 +235,24 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -305,99 +271,44 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = _embed_dim; // Currently require no parallelism along this dim assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ - /* assert(check_output_input_weight_parallel_dims()); */ } SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, ParallelTensor const _input, - ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0 /*weights*/, 1 /*outputs*/, - _input, - _weight), + _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) -// bias_initializer(_bias_initializer) -{ + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -407,66 +318,15 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = _embed_dim; // Currently require no parallelism along this dim assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ - // Check correctness - /* assert(check_output_input_weight_parallel_dims()); */ } SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttention const &other, - ParallelTensor const input, - bool allocate_weights) + ParallelTensor const input) : SpecIncMultiHeadSelfAttention(model, other.layer_guid, input, @@ -476,22 +336,18 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.qkv_bias, - other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, - allocate_weights, other.name) {} SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, - bool allocate_weights, char const *name) : SpecIncMultiHeadSelfAttention(model, params.layer_guid, @@ -502,15 +358,12 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.qkv_bias, - params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, - allocate_weights, params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( @@ -541,18 +394,12 @@ void SpecIncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -580,18 +427,12 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -599,8 +440,7 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) { /* regions[0](I): input - 
regions[1](I): weight - regions[2](O): output + regions[1](O): output */ OpMeta *SpecIncMultiHeadSelfAttention::init_task( Task const *task, @@ -618,17 +458,10 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], + regions[1], + task->regions[1], FID_DATA, ctx, runtime); @@ -643,14 +476,8 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) - SpecIncMultiHeadSelfAttentionMeta *m = - new SpecIncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); + SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); // assert that we didn't over allocate memory assert(gpu_mem_allocator.instance_allocated_size == gpu_mem_allocator.instance_total_size); @@ -658,8 +485,6 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( m->inference_debugging = attn->inference_debugging; std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); return m; } @@ -697,12 +522,6 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, @@ -710,21 +529,12 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void SpecIncMultiHeadSelfAttention::inference_task( Task const *task, @@ -741,51 +551,29 @@ void SpecIncMultiHeadSelfAttention::inference_task( SpecIncMultiHeadSelfAttentionMeta *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, &bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, &bc, {input}, {}, {output}); } } @@ -809,8 +597,7 @@ Op *SpecIncMultiHeadSelfAttention::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { SpecIncMultiHeadSelfAttentionParams params = get_params(); - return new SpecIncMultiHeadSelfAttention( - ff, params, inputs[0], true, this->name); + return new SpecIncMultiHeadSelfAttention(ff, params, inputs[0], this->name); } bool SpecIncMultiHeadSelfAttention::measure_operator_cost( @@ -823,9 +610,20 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + 
lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -842,10 +640,8 @@ SpecIncMultiHeadSelfAttentionParams params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -870,10 +666,15 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index aebd5e8892..b2f4e35d5e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -16,6 +16,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" #include #include @@ -26,13 +27,310 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { -namespace SpecIncMultiHeadAttention { +namespace SpecIncMultiHeadSelfAttention { + +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +template +__global__ void compute_spec_inc_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, + BatchConfig::BitMask *causalMask, + bool *request_completed) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = 
typename VEC_K<DT, THREADS_PER_KEY>::Type;
+  using V_vec = typename VEC_V<DT>
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // nth request idx + int const request_idx = blockIdx.y; + + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int const first_step = 0; + + // int const tlength = + // request_infos[batch_config_request_id].first_token_depth_in_request + + // request_infos[batch_config_request_id].num_tokens_in_batch; + + int const totalCacheSize = + bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; + + int first_token_idx = 0; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += request_completed[r] ? 0 : causalMask[r].this_layer_size; + } + + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. 
+ constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < tree_branch_num; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + + int const query_token = + bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < totalCacheSize) { + + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { + // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", + // batch_config_request_id, + // ti, + // qk, + // qi); + // } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. 
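// The qk_max / exp_sum reductions above follow the usual two-level block
// reduction: lanes first combine partial values with warp shuffles, each
// warp leader parks its result in red_smem, and the first WARPS_PER_BLOCK
// lanes then fold those per-warp values together before broadcasting.  A
// minimal standalone sketch of the same pattern (simplified and not part of
// this patch; it reduces across the whole warp instead of stopping at
// THREADS_PER_KEY, and assumes WARPS is a power of two no larger than 32):
//
//   template <int WARPS>
//   __device__ float block_max(float v, float *red_smem) {
//     for (int m = WARP_SIZE / 2; m >= 1; m /= 2)
//       v = fmaxf(v, WARP_SHFL_XOR(uint32_t(-1), v, m));
//     if (threadIdx.x % WARP_SIZE == 0) red_smem[threadIdx.x / WARP_SIZE] = v;
//     __syncthreads();
//     v = (threadIdx.x < WARPS) ? red_smem[threadIdx.x] : -FLT_MAX;
//     for (int m = WARPS / 2; m >= 1; m /= 2)
//       v = fmaxf(v, WARP_SHFL_XOR(uint32_t(-1), v, m));
//     return WARP_SHFL(uint32_t(-1), v, 0);
//   }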
+ // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
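// The halving loop above merges the V_PER_ITER partial output accumulators:
// in each round the upper half of the active groups spills its Out_sum to
// out_smem and the lower half adds it in, so after log2(V_PER_ITER) rounds
// group 0 holds the full softmax-weighted sum of values.  For example, with
// THREADS_PER_BLOCK = 128 and THREADS_PER_VALUE = 32 (illustrative values),
// V_PER_ITER = 4 and the merge takes two rounds: (0 += 2, 1 += 3), then
// (0 += 1), before the result is written out below.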
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} template -__global__ void spec_store_kv_cache( +__global__ void spec_inc_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, @@ -40,16 +338,16 @@ __global__ void spec_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int max_seq_len, - int max_beam_width, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -58,82 +356,25 @@ __global__ void spec_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - int const beam_depth = beamRequestInfos[req_id].current_depth; - int const beam_width = beamRequestInfos[req_id].beam_size; - - // new token - kCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - - // replica in the root iteration - if (beam_depth == 1) { - for (int i = 1; i < beam_width; i++) { - kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - } - } + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - // naive cache stealing - if (sub_req_id != parent_id) { - if (offset == 0 && tok_id == 0) { - printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - "%d, tok_id %d\n", - beam_depth, - req_id, - sub_req_id, - parent_id, - tok_id); - } + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; - for (int depth = 0; depth < beam_depth; depth++) { - int steal_token_idx = tok_id - beam_depth + depth; - int steal_from_idx = (req_id * max_beam_width + parent_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; - vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; - - // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ - // printf("cache stealing kernel!, steal_token_idx %d\n", - // 
steal_token_idx); - // } - } - } + BatchConfig::BitMask bitmask = causalMask[req_id]; - // parallel cache stealing not yet implemented - // logic shld be - // launch spec_store_kv_cache with parallelism * current depth - // from the i here, get depth index - // if depth index not the current one, check if we need to steal - // steal if needed - - // cache stealing theory - // identify which sub request does this token come from - // for initial token, 0 - // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and - // which to be delete copy beam_size bunch of blocks when sub_req_id == - // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + // if prompt token -> token id + // if tree token: + + int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + + bitmask.tree_size - 1 - bitmask.this_layer_size + + token_idx - request_token_offset; + + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -143,11 +384,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache
<DT>), + hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_inc_store_kv_cache<DT>
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -159,17 +398,71 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->request_infos, m->beam_token_infos, m->beam_request_infos, + m->causalMask, m->qProjSize, m->kProjSize, m->vProjSize, num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), /*root*/ curr_depth == 0, m->hidden_size); } } +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_spec_inc_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->causalMask, \ + m->request_completed) + +template +void compute_spec_inc_attention_kernel_generation( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + // one block == one head per request + // how many generation requests + dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template __global__ void spec_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -188,331 +481,268 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { +void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = hipblas_data_type; - // #else - // // TODO: currently use the hipblas_data_type - // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // hipblasDatatype_t compute_type = hipblas_data_type; - // #endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_infr_tokens(); + + int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || + 
(bc->requestsInfo[i].num_tokens_in_batch == 0)) { + continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } - for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; - - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(spec_fill_entries_above_diagonal
), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - tokens_previous_requests * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + // all requests in prompt phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; + + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + + if (num_new_tokens <= 0) { + continue; } - } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast
(m->qk_prods); + + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_fill_entries_above_diagonal
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + + int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + + C = static_cast
(m->attn_heads) + + (token_offset)*m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; } - assert(tokens_previous_requests == num_tokens); + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { - // here because we need postion info in infernece 1 - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - checkCUDA( - hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - max_tokens_per_batch * sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA(hipMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA( - hipMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - max_tokens_per_batch * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA(hipMemcpyAsync( - m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - hipMemcpyHostToDevice, - stream)); + + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first + compute_qkv_kernel( + m, bc, shard_id, static_cast
(m->devQKVProjArray), stream); // phase 2: Update key/val cache update_kv_cache_kernel
<DT>(m, bc, stream); - + if (bc->num_generation_tokens > 0) { + compute_spec_inc_attention_kernel_generation<DT>
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_tokens > bc->num_generation_tokens) { + compute_attention_kernel_prompt(m, bc, shard_id, output_ptr, stream); + } + + int num_tokens = bc->num_active_tokens(); + + hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); } -} // namespace SpecIncMultiHeadAttention +} // namespace SpecIncMultiHeadSelfAttention } // namespace Kernels /*static*/ @@ -521,12 +751,9 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( BeamSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -535,34 +762,14 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - float const *bias_ptr = - use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -581,7 +788,6 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -596,14 +802,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, @@ -618,43 +821,16 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); - size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); - size_t total_size = - requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + - beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later - - // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); beam_token_infos = - gpu_mem_allocator - .allocate_instance( - beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); - // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - gpu_mem_allocator - .allocate_instance( - beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - assert(gpu_mem_allocator.instance_total_size == - gpu_mem_allocator.instance_allocated_size); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } checkCUDA(hipStreamSynchronize(stream)); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 4688a8233c..d8a2008388 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -463,8 +463,6 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, 
stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -472,23 +470,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; @@ -568,8 +553,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // print_tensor((float*)C, 32, "C"); - // add alibi position bias to qk production + // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; @@ -698,21 +682,26 @@ template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, cudaStream_t stream) { - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first + compute_qkv_kernel( + m, bc, shard_id, static_cast
(m->devQKVProjArray), stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); if (bc->num_generation_tokens > 0) { @@ -722,14 +711,16 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 if (bc->num_tokens > bc->num_generation_tokens) { - compute_attention_kernel_prompt( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_attention_kernel_prompt(m, bc, shard_id, output_ptr, stream); } - // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace SpecIncMultiHeadSelfAttention @@ -741,12 +732,9 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( BeamSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -755,36 +743,14 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - float const *bias_ptr = - use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -797,16 +763,12 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -821,14 +783,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 132a48be40..ae0795ac1e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -61,12 +61,10 @@ Tensor FFModel::inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool qkv_bias, - bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -79,12 +77,10 @@ Tensor FFModel::inc_multihead_self_attention_verify( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -100,12 +96,10 @@ Tensor FFModel::inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool qkv_bias, - bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -117,7 +111,6 @@ Tensor FFModel::inc_multiquery_self_attention_verify( DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; bool offload = cpu_offload; Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 
2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -125,7 +118,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, casted_input); } else { @@ -134,7 +127,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, input); } @@ -148,62 +141,28 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int one_head_size = qParas + kParas + vParas + oParas; - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - { - // compress the weight size if quantization. - if (quantization_type != DT_NONE) { - one_head_size = get_quantization_to_byte_size( - data_type, quantization_type, one_head_size); - } - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering( - 1, - dims, - quantization_type == DT_NONE ? data_type : quantization_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? 
oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); - li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); @@ -230,15 +189,20 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; - layer->get_int_property("scaling_query", value); + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; bool scaling_query = (bool)value; float scaling_factor; layer->get_float_property("scaling_factor", scaling_factor); @@ -261,15 +225,12 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, - false /*allocate_weights*/, quantization_type, offload, tensor_parallelism_degree, @@ -286,32 +247,27 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool 
_scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -330,63 +286,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[i] = _input->dims[i]; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[1].size); - } - // dims[2].degree = 1; - // dims[2].parallel_idx = -1; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } + // No longer require no parallelism along this dim + // assert(dims[0].degree == 1); outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ + /* // Check correctness */ /* assert(check_output_input_weight_parallel_dims()); */ } @@ -394,40 +299,33 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0, 1 /*outputs*/, - _input, - _weight), + _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -435,9 +333,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) -// bias_initializer(_bias_initializer) -{ + tensor_parallelism_degree(_tensor_parallelism_degree) { numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -445,64 +341,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[i] = _input->dims[i]; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim + // Currently require no parallelism along this dim, is this aligned with the + // previous removal of assert? assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[1].size); - } - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ // Check correctness /* assert(check_output_input_weight_parallel_dims()); */ } @@ -510,8 +355,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) + const ParallelTensor input) : TreeIncMultiHeadSelfAttention(model, other.layer_guid, input, @@ -521,15 +365,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.qkv_bias, - other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, - allocate_weights, other.quantization_type, other.offload, other.tensor_parallelism_degree, @@ -539,7 +380,6 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, TreeIncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, - bool allocate_weights, char const *name) : TreeIncMultiHeadSelfAttention(model, params.layer_guid, @@ -550,15 +390,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.qkv_bias, - params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, - allocate_weights, params.quantization_type, params.offload, params.tensor_parallelism_degree, @@ -592,20 +429,12 @@ void TreeIncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - 
READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -633,18 +462,12 @@ void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -652,8 +475,7 @@ void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) { /* regions[0](I): input - regions[1](I): weight - regions[2](O): output + regions[1](O): output */ OpMeta *TreeIncMultiHeadSelfAttention::init_task( Task const *task, @@ -671,17 +493,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], + regions[1], + task->regions[1], FID_DATA, ctx, runtime); @@ -689,14 +504,12 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - // int num_q_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { @@ -705,14 +518,8 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( gpu_mem_allocator.register_reserved_work_space( handle.offload_reserve_space, handle.offload_reserve_space_size); } - TreeIncMultiHeadSelfAttentionMeta *m = - new TreeIncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); + TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); if (!attn->offload) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -723,10 +530,6 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; - if (attn->quantization_type == DT_NONE) { - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - } 
return m; } @@ -764,37 +567,18 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void TreeIncMultiHeadSelfAttention::inference_task( Task const *task, @@ -815,37 +599,19 @@ void TreeIncMultiHeadSelfAttention::inference_task( TreeIncMultiHeadSelfAttentionMeta *m = *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); /* print_tensor(input.get_float_ptr(), @@ -855,18 +621,13 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, &bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, &bc, {input}, {}, {output}); } } @@ -896,9 +657,20 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams 
const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -915,10 +687,8 @@ TreeIncMultiHeadSelfAttentionParams params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -943,10 +713,15 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 890d32bc87..50e2311ca8 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -17,7 +17,6 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" -#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include #include @@ -519,300 +518,6 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, } } -template -void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { - checkCUDA(hipblasSetStream(m->handle.blas, 
stream)); - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - hipblasDatatype_t compute_type = hipblas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = hipblas_data_type; - // #else - // // TODO: currently use the hipblas_data_type - // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // hipblasDatatype_t compute_type = hipblas_data_type; - // #endif - // int num_requests = bc->num_active_requests(); - int processed_tokens_in_batch = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(processed_tokens_in_batch == - bc->requestsInfo[i].first_token_offset_in_batch); - int last_token_idx_of_the_request = - processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; - while (processed_tokens_in_batch <= last_token_idx_of_the_request) { - int num_new_tokens = 1; - int j = processed_tokens_in_batch; - while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { - j++; - num_new_tokens++; - } - - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; - assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); - { - // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(update_tree_branch_kv_cache
<DT>), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - static_cast<DT *>(m->devQKVProjArray),
- static_cast<DT *>(m->keyCache),
- static_cast<DT *>
(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch - BatchConfig::max_sequence_length(), - m->hidden_size); - } - - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens_in_request; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens_in_request; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods); - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - if (*m->position_bias) { - size_t parallelism = - m->num_q_heads * total_tokens_in_request * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens_in_request); - if (num_new_tokens > 1) { - size_t parallelism = - m->num_q_heads * num_new_tokens * total_tokens_in_request; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(tree_fill_entries_above_diagonal
<DT>), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - static_cast<DT>
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens_in_request; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens_in_request; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens_in_request; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - processed_tokens_in_batch += num_new_tokens; - } - // Before moving to the next request - // check that we have finished all tokens of the request - assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); - } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = processed_tokens_in_batch; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } - - assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); -} - #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ @@ -895,27 +600,10 @@ template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { - // additional processing for weight uploading - if (m->handle.offload_reserve_space != nullptr) { - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - checkCUDA(hipMemcpyAsync(m->weight_ptr, - weight_ptr, - m->weightSize, - hipMemcpyHostToDevice, - stream)); - weight_ptr = static_cast
(m->weight_ptr); - if (m->biasSize > 0) { - checkCUDA(hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); - bias_ptr = static_cast
(m->bias_ptr); - } - } + // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing @@ -929,39 +617,36 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // tokens for the current batch m->num_active_infr_tokens = bc->num_active_infr_tokens(); - // here because we need postion info in infernece 1 - if (m->offload && m->biasSize > 0) { - checkCUDA(hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); - bias_ptr = static_cast
(m->bias_ptr); - } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first
+ compute_qkv_kernel( + m, bc, shard_id, static_cast<DT *>(m->devQKVProjArray), stream); // phase 2: No need to update key/val cache - // IncMultiHeadSelfAttention::update_kv_cache_kernel( - // m, bc, stream); - // use the new kernel compute_attention_kernel_fused
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); int processed_tokens_in_batch = bc->num_active_tokens(); - compute_o_prod_bias(m, - bc, - shard_id, - output_ptr, - weight_ptr, - bias_ptr, - processed_tokens_in_batch, - stream); + int num_tokens = bc->num_active_tokens(); + hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); } } // namespace TreeIncMultiHeadAttention @@ -973,12 +658,9 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeVerifyBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -987,44 +669,14 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1037,16 +689,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("TreeIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1061,14 +709,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 86c53d7ea1..8c643b1964 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -494,303 +494,6 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, } } -template -void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = bc->num_active_requests(); - int processed_tokens_in_batch = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(processed_tokens_in_batch == - bc->requestsInfo[i].first_token_offset_in_batch); - int last_token_idx_of_the_request = - 
processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; - while (processed_tokens_in_batch <= last_token_idx_of_the_request) { - int num_new_tokens = 1; - int j = processed_tokens_in_batch; - while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { - j++; - num_new_tokens++; - } - - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; - assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); - { - // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; - update_tree_branch_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray),
- static_cast<DT *>(m->keyCache),
- static_cast<DT *>
(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch - BatchConfig::max_sequence_length(), - m->hidden_size); - } - - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens_in_request; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens_in_request; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = - m->num_q_heads * total_tokens_in_request * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens_in_request); - if (num_new_tokens > 1) { - size_t parallelism = - m->num_q_heads * num_new_tokens * total_tokens_in_request; - tree_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens_in_request; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens_in_request; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens_in_request; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - processed_tokens_in_batch += num_new_tokens; - } - // Before moving to the next request - // check that we have finished all tokens of the request - assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); - } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = processed_tokens_in_batch; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>(output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } - - assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); -} - #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ @@ -873,27 +576,9 @@ template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, cudaStream_t stream) { - // additional processing for weight uploading - if (m->handle.offload_reserve_space != nullptr) { - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - cudaMemcpyAsync(m->weight_ptr, - weight_ptr, - m->weightSize, - cudaMemcpyHostToDevice, - stream); - weight_ptr = static_cast
(m->weight_ptr); - if (m->biasSize > 0) { - cudaMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); - bias_ptr = static_cast
(m->bias_ptr); - } - } // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_infr_tokens stores the number of active @@ -908,39 +593,36 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // tokens for the current batch m->num_active_infr_tokens = bc->num_active_infr_tokens(); - // here because we need postion info in infernece 1 - if (m->offload && m->biasSize > 0) { - cudaMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); - bias_ptr = static_cast
(m->bias_ptr); - } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first
+ compute_qkv_kernel( + m, bc, shard_id, static_cast<DT *>(m->devQKVProjArray), stream); // phase 2: No need to update key/val cache - // IncMultiHeadSelfAttention::update_kv_cache_kernel( - // m, bc, stream); - // use the new kernel compute_attention_kernel_fused
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); int processed_tokens_in_batch = bc->num_active_tokens(); - compute_o_prod_bias(m, - bc, - shard_id, - output_ptr, - weight_ptr, - bias_ptr, - processed_tokens_in_batch, - stream); + int num_tokens = bc->num_active_tokens(); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace TreeIncMultiHeadAttention @@ -952,12 +634,9 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeVerifyBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -966,44 +645,14 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1021,7 +670,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1036,14 +684,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index dc43d80133..a4443c4066 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -73,7 +73,7 @@ AllReduce::AllReduce(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = _input->dims[i]; } - assert(dims[allreduce_dim].degree > 1); + // assert(dims[allreduce_dim].degree > 1); // ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, _input->data_type, this); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index c373e0da9b..e73893475c 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -80,51 +80,56 @@ std::string removeGuidOperatorName(std::string const &input) { } template -void load_attention_weights_multi_query(DT *ptr, - std::string layer_name, - std::string weights_folder, - size_t hidden_dim, - int num_heads) { - - std::string qkv_file = layer_name.substr(0, layer_name.find("attention")) + - "attention_query_key_value_weight"; - std::string o_file = layer_name.substr(0, layer_name.find("attention")) + - "attention_dense_weight"; +void load_attention_o_proj_bias_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder) { + std::string filename = layer_name + ".o_proj.bias"; - // q has n_heads heads, k and v only have one head, o have n_head heads - std::vector weight_filenames = {qkv_file, o_file}; int file_index = 0; - int data_index = 0; - for (auto filename : weight_filenames) { - std::cout << "Loading weight file " << filename << std::endl; - std::string weight_filepath = join_path({weights_folder, filename}); - size_t partial_size = - file_index == 0 ? (hidden_dim + 2 * hidden_dim / num_heads) * hidden_dim - : hidden_dim * hidden_dim; - std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); - // std::cout << "Loading filename: " << weight_filepath << std::endl; - if (!in.good()) { - std::cout << "Could not open file: " << weight_filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - size_t in_get_size = in.gcount(); + // now only opt use this. + // assert(num_heads == num_kv_heads); + int idx = 0; - if (in_get_size != loaded_data_size) { - std::cout << "load data error " << in_get_size << ", " - << loaded_data_size; - assert(false && "data size mismatch"); - } - for (int i = 0; i < partial_size; i++) { - ptr[data_index++] = host_array.at(i); - } - file_index++; + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int n_heads = num_heads; + + int replicate_num = num_heads / num_kv_heads; + + size_t out_partial_size = hidden_dim; + size_t partial_size = out_partial_size; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + for (int i = 0; i < partial_size; i++) { + ptr[i] = host_array.at(data_index); + data_index++; + } + + in.close(); } template @@ -135,44 +140,53 @@ void load_attention_bias_v2(DT *ptr, size_t qkv_inner_dim, bool final_bias, std::string layer_name, - std::string weights_folder) { + std::string weights_folder, + int tp_degree) { std::string q_file = layer_name + ".q_proj.bias"; std::string k_file = layer_name + ".k_proj.bias"; std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; - if (final_bias) { - std::string o_file = layer_name + ".o_proj.bias"; - bias_files.push_back(o_file); - } - int file_index = 0; - - // now only opt use this. - // assert(num_heads == num_kv_heads); - int idx = 0; + // linear layer weights: [output_size, input_size] + // bias layer weights: [output_size] + // Q,K,V projection weights: [head_dim*num_heads, hidden_size] = [768, 768] + // QKV bias weights: [head_dim*num_heads] = [768], organized as: [head_dim_0, + // head_dim_1, ...] + + // need to rearrange: [[q_heads_shard_0], [k_heads_shard_0], + // [v_heads_shard_0], ..., [q_heads_shard_n], [k_heads_shard_n], + // [v_heads_shard_n]] where n = tp_degree + assert(num_heads % tp_degree == 0); + assert(num_kv_heads % tp_degree == 0); + assert(hidden_dim % num_heads == 0); + assert(qkv_inner_dim == hidden_dim / num_heads); + size_t q_heads_per_shard = num_heads / tp_degree; + size_t kv_heads_per_shard = num_kv_heads / tp_degree; + size_t shard_chunk_size = + (q_heads_per_shard + 2 * kv_heads_per_shard) * qkv_inner_dim; + int file_index = 0; for (auto filename : bias_files) { std::cout << "Loading weight file " << filename << std::endl; std::string weight_filepath = join_path({weights_folder, filename}); int n_heads = file_index == 0 ? num_heads : num_kv_heads; - - int replicate_num = num_heads / num_kv_heads; - - size_t qkv_partial_size = qkv_inner_dim * n_heads; - size_t qkv_replicate_size = qkv_inner_dim * num_heads; - size_t out_partial_size = hidden_dim; - size_t partial_size = - (file_index < 3) ? qkv_partial_size : out_partial_size; + assert(n_heads % tp_degree == 0); + int heads_per_shard = n_heads / tp_degree; + int qkv_prev_heads_cur_shard = + (file_index == 2) ? num_heads + num_kv_heads : file_index * num_heads; + assert(qkv_prev_heads_cur_shard % tp_degree == 0); + qkv_prev_heads_cur_shard /= tp_degree; + + // load into memory first + size_t bias_size = qkv_inner_dim * n_heads; std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); + std::vector
host_array(bias_size); + size_t loaded_data_size = sizeof(DT) * bias_size; in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) { printf( "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", @@ -180,43 +194,37 @@ void load_attention_bias_v2(DT *ptr, loaded_data_size); assert(false); } - assert(partial_size == host_array.size()); - - size_t data_index = 0; - - // q, o - if (file_index == 0 || file_index == 3) { - for (int i = 0; i < partial_size; i++) { - ptr[idx + i] = host_array.at(data_index); - data_index++; - } - } else { - // k, v - for (int i = 0; i < partial_size; i++) { - for (int j = 0; j < replicate_num; j++) { - ptr[idx + j * partial_size + i] = host_array.at(data_index); - } - data_index++; + assert(bias_size == host_array.size()); + + // now copy chunks into ptr + for (int i = 0; i < n_heads; i++) { + int shard_idx = i / heads_per_shard; + for (int j = 0; j < qkv_inner_dim; j++) { + int src_idx = i * qkv_inner_dim + j; + int dst_idx = shard_idx * shard_chunk_size + + qkv_prev_heads_cur_shard * qkv_inner_dim + + (i % heads_per_shard) * qkv_inner_dim + j; + ptr[dst_idx] = host_array.at(src_idx); } } - file_index++; - idx += qkv_replicate_size; - in.close(); } } template -void load_attention_weights_v2(DT *ptr, - int num_heads, - int num_kv_heads, - size_t hidden_dim, - size_t qkv_inner_dim, - std::string layer_name, - std::string weights_folder, - size_t volume, - int tensor_parallelism_degree) { +void load_attention_weights_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree, + bool load_o_proj) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; std::string k_file = layer_name + ".k_proj.weight"; std::string v_file = layer_name + ".v_proj.weight"; @@ -241,64 +249,64 @@ void load_attention_weights_v2(DT *ptr, int replicate_num = num_heads / num_kv_heads; // stride for q, k, v, o - size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + size_t stride_size = (q_size + v_replicate_size + k_replicate_size) / tensor_parallelism_degree; - for (auto filename : weight_filenames) { - std::cout << "Loading weight file " << filename << std::endl; - std::string weight_filepath = join_path({weights_folder, filename}); - - int data_index = 0; - size_t partial_size = (file_index == 0 || file_index == 3) - ? one_weight_file_size - : single_proj_size * num_kv_heads; - size_t one_partition_size = - one_weight_file_size / tensor_parallelism_degree; - - std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - std::cout << "Could not open file: " << weight_filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - size_t in_get_size = in.gcount(); + if (!load_o_proj) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) { - std::cout << "load attention data error " << in_get_size << ", " - << loaded_data_size << ", " << file_index << ", " - << weight_filepath << "\n"; - assert(false && "data size mismatch"); - } - // wq, wk, wo - if (file_index == 0) { - for (int i = 0; i < tensor_parallelism_degree; i++) { - for (int j = 0; j < one_partition_size; j++) { - ptr[base_index + i * stride_size + j] = host_array.at(data_index++); - } + if (in_get_size != loaded_data_size) { + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; + assert(false && "data size mismatch"); } - } else { - for (int i = 0; i < num_heads; i++) { - int kv_idx = i / (num_heads / num_kv_heads); - int head_idx = i % (num_heads / tensor_parallelism_degree); - int tp_idx = (i / (num_heads / tensor_parallelism_degree)); - for (int j = 0; j < single_proj_size; j++) { - ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + - j] = host_array.at(kv_idx * single_proj_size + j); + // wq, wk, wo + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = host_array.at(data_index++); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + + single_proj_size * head_idx + j] = + host_array.at(kv_idx * single_proj_size + j); + } } } + // std::cout << "host array going out of scope, releasing" << endl; + base_index += one_partition_size; + file_index++; } - - // assert(data_index == partial_size); - base_index += one_partition_size; - file_index++; - } - assert(base_index == (q_size + k_replicate_size + v_replicate_size) / - tensor_parallelism_degree); - - { + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); + } else { std::cout << "Loading weight file " << o_file << std::endl; std::string weight_filepath = join_path({weights_folder, o_file}); @@ -314,6 +322,15 @@ void load_attention_weights_v2(DT *ptr, in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); + DT temp; + + for (int i = 0; i < one_weight_file_size; i++) { + temp = host_array.at(i); + } + + // std::cout<<"o_proj loaded into host array, total size: + // "<name)); + bool is_attn_proj = false, is_o_proj = false; + + // dense layers for attention projection is named as + // self_attn.qkv_proj or self_attn.o_proj + // so looking for self_attn. 
in the name can determine if it is an attention + // projection + if (weight_filename.find("attn.") != std::string::npos || + weight_filename.find("self_attention.") != std::string::npos) { + size_t pos = weight_filename.find(".o_proj"); + if (pos != std::string::npos) { + weight_filename.replace(pos, std::string(".o_proj").length(), ""); + is_o_proj = true; + } else { + pos = weight_filename.find(".qkv_proj"); + if (pos == std::string::npos) { + cout << weight_filename << endl; + } + assert(pos != std::string::npos); + weight_filename.replace(pos, std::string(".qkv_proj").length(), ""); + } + is_attn_proj = true; + } if (ff->config.benchmarking) { std::cout << "Initializing weight " << weight_filename @@ -730,28 +773,51 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); + } else if (is_attn_proj) { + if (is_o_proj) { + if (weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + true); + } else { + load_attention_o_proj_bias_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder); + } } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); + if (weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + false); + } else { + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + false, // do not load o_proj bias + weight_filename, + weights_folder, + tensor_parallelism_degree); + } } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 1a38782e81..2bc64c1670 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2331,10 +2331,17 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2358,10 +2365,17 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); 
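// The serialization hunks above store rotary_embedding_meta.rope_type as its
// length followed by the raw characters, and the matching deserialization
// below reads it back into a fixed 1024-byte buffer. A minimal self-contained
// sketch of that length-prefixed string pattern, using a plain byte buffer
// instead of the serializer used in the patch (all names here are
// illustrative, not FlexFlow/Legion API):
#include <cstring>
#include <string>
#include <vector>

inline void put_string(std::vector<char> &buf, std::string const &s) {
  std::size_t len = s.size();
  char const *len_bytes = reinterpret_cast<char const *>(&len);
  buf.insert(buf.end(), len_bytes, len_bytes + sizeof(len)); // length first
  buf.insert(buf.end(), s.data(), s.data() + len);           // then raw chars
}

inline std::string get_string(char const *&cursor) {
  std::size_t len = 0;
  std::memcpy(&len, cursor, sizeof(len));
  cursor += sizeof(len);
  char tmp[1024] = {0};          // fixed buffer, mirroring the hunks below
  std::memcpy(tmp, cursor, len); // assumes len < 1024, as the patch does
  cursor += len;
  return std::string(tmp, len);
}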
sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2382,10 +2396,17 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2817,8 +2838,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, offload, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2830,10 +2852,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2853,11 +2883,9 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; params.add_zero_attn = 
add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; @@ -2874,8 +2902,8 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -2886,10 +2914,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2906,11 +2942,9 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; @@ -2926,8 +2960,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, offload, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2939,10 +2974,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + 
dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2962,11 +3005,9 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 1b65dfd869..f39ea91f28 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -800,6 +800,7 @@ void FFModel::compile_inference() { false /*must*/, 0 /*mapper_id*/, view.hash() /*MappingTagID*/); + index_launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); int idx = 0; diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc index 8f33f6db87..72e71688c1 100644 --- a/src/runtime/layer.cc +++ b/src/runtime/layer.cc @@ -87,6 +87,11 @@ void Layer::add_int_vector_property(std::string const &key, int_vector_properties[key] = value; } +void Layer::add_string_property(std::string const &key, + std::string const &value) { + string_properties[key] = value; +} + void Layer::add_initializer(std::string const &key, Initializer *initializer) { initializers[key] = initializer; } @@ -125,6 +130,18 @@ bool Layer::get_int_vector_property(std::string const &key, } } +bool Layer::get_string_property(std::string const &key, + std::string &value) const { + auto const &it = string_properties.find(key); + if (it == string_properties.end()) { + assert(false); + return false; + } else { + value = it->second; + return true; + } +} + bool Layer::get_initializer(std::string const &key, Initializer *&initializer) const { auto const &it = initializers.find(key); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 52f1dd2220..69fe3b598d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1156,16 +1156,25 @@ bool Op::check_output_input_weight_same_parallel_is() const { IndexSpace parallel_is = outputs[0]->parallel_is; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { + std::cout << "outputs[" << i << "] has different parallel_is " + << outputs[i]->parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } for (int i = 0; i < numInputs; i++) { if (inputs[i]->parallel_is != parallel_is) { + std::cout << "inputs[" << i << "] has different parallel_is " + << inputs[i]->parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { + std::cout << "weights[" << i << "] has different parallel_is " + << weights[i]->parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } @@ -3414,26 +3423,28 @@ bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && 
layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + ( + // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { return true; } return false; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index dcac52397a..d5bfcfc48e 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,6 +2,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include +#include #include namespace FlexFlow { @@ -29,7 +30,15 @@ fs::path get_dst_folder(std::string const &subdir, if (before_kernel) { step_substr += "_pre"; } + char cwd[PATH_MAX]; + getcwd(cwd, sizeof(cwd)); + + // char const *ff_cache_path = std::string(std::getenv("FF_DEBUG_PATH")) == + // "." ? + // cwd : std::getenv("FF_DEBUG_PATH"); + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" : std::string("~/.cache/flexflow/debug/flexflow"); @@ -38,6 +47,9 @@ fs::path get_dst_folder(std::string const &subdir, debug_dir_ = p.we_wordv[0]; wordfree(&p); fs::path debug_dir = debug_dir_; + if (!fs::is_directory(debug_dir)) { + printf("invalid debug directory: %s\n", debug_dir.c_str()); + } assert(fs::is_directory(debug_dir)); fs::path dst_folder = debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 9b6510fe5e..0e28c02cdf 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3734,15 +3734,14 @@ bool FFModel::convert_graph_to_operators( case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)node.ptr; - new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); + new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); TreeIncMultiHeadSelfAttention *attn = (TreeIncMultiHeadSelfAttention *)node.ptr; - new_op = - new TreeIncMultiHeadSelfAttention(*this, *attn, inputs[0], true); + new_op = new TreeIncMultiHeadSelfAttention(*this, *attn, inputs[0]); break; } case OP_RMS_NORM: { diff --git a/tests/fine_grained_alignment_test.sh b/tests/fine_grained_alignment_test.sh new file mode 100755 index 0000000000..9ad26318f9 --- /dev/null +++ b/tests/fine_grained_alignment_test.sh @@ -0,0 +1,106 @@ +#! /usr/bin/env bash +set -x +set -e + +MODEL_NAME=${MODEL_NAME:-"JackFram/llama-160m"} +MEMORY_PER_GPU=${MEMORY_PER_GPU:-14000} +ZCOPY_MEMORY=${ZCOPY_MEMORY:-40000} +TP_DEGREE=${TP_DEGREE:-2} +PP_DEGREE=${PP_DEGREE:-2} +CACHE_PATH=${FF_CACHE_PATH:-"~/.cache/flexflow"} +NUM_STEPS=${NUM_STEPS:-2} + +cleanup() { + rm -rf "${CACHE_PATH}"/debug ./fine_grained_alignment_config.json ./inference/output/fine_grained_alignment_test_ff.txt ./inference/output/fine_grained_alignment_test_hf.txt +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/.." + +# Initial cleanup +cleanup + +# Create test prompt file +mkdir -p ./inference/prompt +echo '["Three tips for staying healthy are: "]' > ./inference/prompt/test.json + +# Create output folder +mkdir -p ./inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 +export FF_DEBG_NO_WEIGHTS=1 +FUSION=true + + +# Check if the Python code executed successfully +if ! 
PROMPT_LENGTH=$(python -c " +from transformers import AutoTokenizer +import os +tokenizer = AutoTokenizer.from_pretrained(\"$MODEL_NAME\") +tokens = tokenizer.tokenize('Three tips for staying healthy are: ') +print(len(tokens)) +"); +then + echo "Error: Failed to execute Python code" + exit 1 +fi + +MAX_LENGTH=$((PROMPT_LENGTH + NUM_STEPS + 1)) + +python ./tests/inference/huggingface_inference.py \ + --model-name "${MODEL_NAME}" \ + --max-length "${MAX_LENGTH}" \ + --prompt-file ../../inference/prompt/test.json \ + --output-file ../../inference/output/fine_grained_alignment_test_hf.txt \ + --use-full-precision \ + --inference-debugging + +NUM_GPUS=$((TP_DEGREE * PP_DEGREE)) +json_config=$(cat <<-END + { + "num_gpus": ${NUM_GPUS}, + "memory_per_gpu": ${MEMORY_PER_GPU}, + "zero_copy_memory_per_node": ${ZCOPY_MEMORY}, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": ${TP_DEGREE}, + "pipeline_parallelism_degree": ${PP_DEGREE}, + "inference_debugging": true, + "fusion": ${FUSION}, + "refresh_cache": false, + "llm_model": "${MODEL_NAME}", + "cache_path": "${CACHE_PATH}", + "full_precision": true, + "prompt": "./inference/prompt/test.json", + "max_length": $MAX_LENGTH, + "output_file": "./inference/output/fine_grained_alignment_test_ff.txt" + } +END +) +echo "$json_config" > ./fine_grained_alignment_config.json + +python ./inference/python/incr_decoding.py -config-file ./fine_grained_alignment_config.json + +# # C++ test +# echo "C++ test" +# ./build/inference/incr_decoding/incr_decoding \ +# -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ +# -tensor-parallelism-degree 2 \ +# -ll:fsize 8192 -ll:zsize 12000 \ +# -llm-model $MODEL_NAME \ +# -prompt ./inference/prompt/peft.json \ +# --use-full-precision \ +# --inference-debugging + +# Check alignment +python ./tests/inference/inference_alignment_test.py -m "$MODEL_NAME" -tp "$TP_DEGREE" -n "$NUM_STEPS" + +# Print succeess message +echo "" +echo "Inference alignment tests passed (model ${MODEL_NAME})!" 
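# Worked example of the length bookkeeping above, assuming a 10-token prompt
# (the count is illustrative): with NUM_STEPS=2,
#   MAX_LENGTH=$((10 + 2 + 1))   # prompt tokens + new tokens + one extra slot = 13
# The HuggingFace run and the FlexFlow config receive this same MAX_LENGTH, so the
# two runs decode the same number of steps and their per-step tensor dumps can be
# compared one-to-one by inference_alignment_test.py.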
+echo "" + +# Cleanup after the test +cleanup diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 5e563c9974..fa72bef463 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -10,30 +10,9 @@ LlamaTokenizer, GenerationConfig, ) -######################### debugging helper functions ######################### -def pre_forward_hook(module, input): - assert module.name is not None and module.decoding_step is not None - name = module.name.replace("model.", "") - print( - f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" - ) - print("Pre-Input: ", input[0].shape) - torch.save( - input, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.input" - ) -def post_forward_hook(module, input, output): - assert module.name is not None and module.decoding_step is not None - name = module.name.replace("model.", "") - print( - f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" - ) - print("Post-Input/Output: ", input[0].shape, output[0].shape) - torch.save( - output, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.output" - ) - print("===") - module.decoding_step += 1 -############################################################################## +import sys +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "peft")) +from hf_utils import * def main(): # Change working dir to folder storing this script @@ -91,26 +70,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample + if not args.do_sample: + generation_config.num_beams=1 + generation_config.temperature = None + generation_config.top_p = None ################# debugging ################# if args.inference_debugging: # Print model and configs print(hf_config) print(model) - # Save weights to file - shutil.rmtree("./hf_tensors") - # Check that the output folder exists - os.makedirs("./hf_tensors", exist_ok=True) + make_debug_dirs() + register_inference_hooks(model) # Save weights - for name, params in model.named_parameters(): - torch.save(params, f"./hf_tensors/{name}") - # params.detach().cpu().numpy().tofile(f"./hf_tensors/{name}") - # Register hooks to save per-op hidden states - for name, layer in dict(model.named_modules()).items(): - layer.name = name - layer.decoding_step = 0 - print(f"Adding hooks to layer {layer.name}") - layer.register_forward_pre_hook(pre_forward_hook) - layer.register_forward_hook(post_forward_hook) + save_model_weights(model, target_modules=["lora", "lm_head", "final_layer_norm", "self_attn_layer_norm", "out_proj", "fc1", "fc2"]) + ############################################### # Generate output with open(args.output_file, "w") as f: diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py new file mode 100644 index 0000000000..6fff4906f7 --- /dev/null +++ b/tests/inference/inference_alignment_test.py @@ -0,0 +1,817 @@ +import numpy as np +import os, torch, argparse, sys +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "peft")) +from alignment.align_test_utils import * +from transformers import AutoConfig +from tqdm import tqdm + +class AlignmentTest: + def __init__(self, hf_config, tp_degree=1): + raise NotImplementedError() + def 
check_weights_alignment(self): + raise NotImplementedError() + def check_fwd_pass(self): + raise NotImplementedError() + def check_bwd_pass(self): + raise NotImplementedError() + def check_step(self, step_idx, learning_rate=0.001): + raise NotImplementedError() + +class LllamaAlignmentTest(AlignmentTest): + def __init__(self, hf_config, tp_degree=1): + self.hf_config = hf_config + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.intermediate_size + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.hf_config.num_key_value_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + + self.num_tokens = None + self.ff_batch_size = None + + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight": + f_version = f"layers.{self.num_layers-1}.lm_head.weight_0" + elif hf_filename == "norm.weight": + f_version = f"layers.{self.num_layers-1}.norm.weight_0" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "norm" in ff_weight_name: + return 1 + if "gate_proj" in ff_weight_name or "up_proj" in ff_weight_name: + return 1 + elif "down_proj" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weight_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) + ff_weight_shape = list(hf_weight_shape)[::-1] + if ff_partition_dim >= 0: + ff_weight_shape[ff_partition_dim] //= self.tp_degree + + # 2. 
handle flexflow shards in case of tensor parallelism + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weight_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", ".qkv_proj").replace(".k_proj", ".qkv_proj").replace(".v_proj", ".qkv_proj")#.replace(".o_proj", "") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + print("loading ff tensor: ", ff_tensor_filename) + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the 
partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print(hf_tensor.shape) + print("FF tensor:") + print(ff_tensor.squeeze()) + print(ff_tensor.shape) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input laye norm + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + if i == 0: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + else: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} output") + + # Attention QKV projections + hf_q_proj_tensor_name = f"layers.{i}.self_attn.q_proj" + hf_k_proj_tensor_name = f"layers.{i}.self_attn.k_proj" + hf_v_proj_tensor_name = 
f"layers.{i}.self_attn.v_proj" + ff_qkv_tensor_name = convert_hf_filename_to_ff(hf_q_proj_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_q_proj_in = get_hf_tensor(hf_q_proj_tensor_name, input_comparison) + hf_k_proj_in = get_hf_tensor(hf_k_proj_tensor_name, input_comparison) + hf_v_proj_in = get_hf_tensor(hf_v_proj_tensor_name, input_comparison) + hf_q_proj_out = get_hf_tensor(hf_q_proj_tensor_name, output_comparison) + hf_k_proj_out = get_hf_tensor(hf_k_proj_tensor_name, output_comparison) + hf_v_proj_out = get_hf_tensor(hf_v_proj_tensor_name, output_comparison) + ff_qkv_tensor_in = get_ff_tensor(ff_qkv_tensor_name, input_comparison, hf_q_proj_in.shape) + torch.testing.assert_close(hf_q_proj_in, hf_k_proj_in) + torch.testing.assert_close(hf_k_proj_in, hf_v_proj_in) + compare(hf_q_proj_in, ff_qkv_tensor_in, label=f"QKV proj {i} input") + ff_qkv_tensor_out = get_ff_tensor( + ff_qkv_tensor_name, + output_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + head_dim = hf_q_proj_out.shape[2] // self.num_attention_heads + heads_per_shard = self.num_attention_heads // self.tp_degree + chunk_size = head_dim * heads_per_shard + # print(ff_qkv_tensor_out.shape) + ff_qproj_out = ff_qkv_tensor_out[:chunk_size, :, :] + ff_kproj_out = ff_qkv_tensor_out[chunk_size:2*chunk_size, :, :] + ff_vproj_out = ff_qkv_tensor_out[2*chunk_size : 3*chunk_size, :, :] + qkv_chunk_size = 3*chunk_size + for tp_idx in range(1, self.tp_degree): + prev_size = tp_idx * qkv_chunk_size + ff_qproj_out_ = ff_qkv_tensor_out[prev_size : prev_size + chunk_size, :, :] + ff_kproj_out_ = ff_qkv_tensor_out[prev_size + chunk_size : prev_size + 2*chunk_size, :, :] + ff_vproj_out_ = ff_qkv_tensor_out[prev_size + 2*chunk_size : prev_size + 3*chunk_size, :, :] + ff_qproj_out = np.concatenate((ff_qproj_out, ff_qproj_out_), axis=0) + ff_kproj_out = np.concatenate((ff_kproj_out, ff_kproj_out_), axis=0) + ff_vproj_out = np.concatenate((ff_vproj_out, ff_vproj_out_), axis=0) + compare_loaded_tensors(hf_q_proj_out.T, ff_qproj_out) + compare_loaded_tensors(hf_k_proj_out.T, ff_kproj_out) + compare_loaded_tensors(hf_v_proj_out.T, ff_vproj_out) + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + ff_attn_tensor_in = get_ff_tensor( + ff_tensor_name, + input_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + assert torch.allclose(ff_qkv_tensor_out, ff_attn_tensor_in) + + # Attention + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = convert_hf_filename_to_ff(f"layers.{i}.self_attn") + # the raw attention result, w/o o_proj. 
This is the output of senf_attn of FF and the input of o_proj in HF + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # TP for self-attn partitions the attention heads across TP workers + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # Post-attention layernorm + hf_tensor_name = f"layers.{i}.post_attention_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Post-attention layernorm {i} output") + + # W1 (gate_proj) + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 {i} output") + + # W3 (up_proj) + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} output") + + # W2 (down_proj) + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} input") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) 
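        # A condensed sketch (readability aid only, never called by the checks in this
        # pass) of how get_ff_tensor above merges the per-shard FlexFlow dumps before
        # each comparison; TPType is the enum imported from align_test_utils.
        def merge_tp_shards(shards, tp_type):
            if tp_type == TPType.REPLICATE:
                return shards[0]                       # every shard holds a full copy
            if tp_type == TPType.PARTITION:
                return np.concatenate(shards, axis=0)  # disjoint slices along dim 0
            return np.sum(shards, axis=0)              # TO_REDUCE: sum partial results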
+ input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + +class OPTAlignmentTest(AlignmentTest): + def __init__(self, hf_config, tp_degree=1): + self.hf_config = hf_config + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.ffn_dim + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.num_attention_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + + self.num_tokens = None + self.ff_batch_size = None + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight" or hf_filename == "final_layer_norm.weight": + f_version = f"layers.{self.num_layers-1}.{hf_filename}_0" + elif hf_filename == "lm_head.bias" or hf_filename == "final_layer_norm.bias": + f_version = f"layers.{self.num_layers-1}.{hf_filename.replace('bias', 'weight')}_1" + elif hf_filename.startswith("layers.") and hf_filename.endswith("self_attn.out_proj.bias"): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}.layers.{layernum}.add_bias_residual_layer_norm.weight_0" + elif hf_filename.startswith("layers.") and hf_filename.endswith(".final_layer_norm.weight"): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}.layers.{layernum}.add_bias_residual_layer_norm.weight_1" + elif hf_filename.startswith("layers.") and hf_filename.endswith(".final_layer_norm.bias"): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}.layers.{layernum}.add_bias_residual_layer_norm.weight_2" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." 
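            # Illustrative trace of this mapping on a hypothetical filename:
            #   "layers.3.self_attn.out_proj.weight"
            #     -> "layers.3." + "layers.3.self_attn.o_proj.weight"   (renaming rules above/below)
            #     -> "layers.3.layers.3.self_attn.o_proj.weight_0"      (weight index appended)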
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "").replace("out_proj", "o_proj") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + elif f_version.endswith(".bias"): + f_version = f_version.replace(".bias", ".weight_1") + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "fc1" in ff_weight_name: + return 1 + elif "fc2" in ff_weight_name or "o_proj.weight" in ff_weight_name: + return 0 + else: + return -1 + def get_bias_tp_partition_dim(ff_weight_name) -> int: + if self.tp_degree == 1: + return -1 + elif "lm_head" in ff_weight_name or "fc1" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight") or hf_weight_name.endswith(".bias"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weight_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) if hf_weight_name.endswith(".weight") else get_bias_tp_partition_dim(ff_weight_name) + ff_weight_shape = list(hf_weight_shape)[::-1] + # print(ff_partition_dim, ff_weight_name, hf_w_path, ff_weight_shape) + if ff_partition_dim >= 0: + ff_weight_shape[ff_partition_dim] //= self.tp_degree + + # 2. handle flexflow shards in case of tensor parallelism + if hf_weight_name.endswith(".bias") and ff_partition_dim == -1: + # unpartitioned bias (E.g. 
replicated bias) only lives on shard 0 + ff_weight = load_ff_tensor(ff_w_path, ff_weight_shape) + else: + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weight_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + # print("comparing weight tensor: ", hf_weight_name, " and ", ff_weight_name) + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens" or hf_filename == "embed_positions": + f_version = f"layers.0.{hf_filename}" + elif hf_filename == "lm_head" or hf_filename == "final_layer_norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", ".qkv_proj").replace(".k_proj", ".qkv_proj").replace(".v_proj", ".qkv_proj") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + print("loading ff tensor: ", ff_tensor_filename) + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + 
if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print(hf_tensor.shape) + print("FF tensor:") + print(ff_tensor.squeeze()) + print(ff_tensor.shape) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Positional embedding layer + hf_tensor_name = "embed_positions" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Position Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input layer norm + hf_tensor_name = f"layers.{i}.self_attn_layer_norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Self attention layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, 
output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Self attention layernorm {i} output") + + # Attention QKV projections + hf_q_proj_tensor_name = f"layers.{i}.self_attn.q_proj" + hf_k_proj_tensor_name = f"layers.{i}.self_attn.k_proj" + hf_v_proj_tensor_name = f"layers.{i}.self_attn.v_proj" + ff_qkv_tensor_name = convert_hf_filename_to_ff(hf_q_proj_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_q_proj_in = get_hf_tensor(hf_q_proj_tensor_name, input_comparison) + hf_k_proj_in = get_hf_tensor(hf_k_proj_tensor_name, input_comparison) + hf_v_proj_in = get_hf_tensor(hf_v_proj_tensor_name, input_comparison) + hf_q_proj_out = get_hf_tensor(hf_q_proj_tensor_name, output_comparison) + hf_k_proj_out = get_hf_tensor(hf_k_proj_tensor_name, output_comparison) + hf_v_proj_out = get_hf_tensor(hf_v_proj_tensor_name, output_comparison) + ff_qkv_tensor_in = get_ff_tensor(ff_qkv_tensor_name, input_comparison, hf_q_proj_in.shape) + torch.testing.assert_close(hf_q_proj_in, hf_k_proj_in) + torch.testing.assert_close(hf_k_proj_in, hf_v_proj_in) + compare(hf_q_proj_in, ff_qkv_tensor_in, label=f"QKV proj {i} input") + ff_qkv_tensor_out = get_ff_tensor( + ff_qkv_tensor_name, + output_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + head_dim = hf_q_proj_out.shape[2] // self.num_attention_heads + heads_per_shard = self.num_attention_heads // self.tp_degree + chunk_size = head_dim * heads_per_shard + # print(ff_qkv_tensor_out.shape) + ff_qproj_out = ff_qkv_tensor_out[:chunk_size, :, :] + ff_kproj_out = ff_qkv_tensor_out[chunk_size:2*chunk_size, :, :] + ff_vproj_out = ff_qkv_tensor_out[2*chunk_size : 3*chunk_size, :, :] + qkv_chunk_size = 3*chunk_size + for tp_idx in range(1, self.tp_degree): + prev_size = tp_idx * qkv_chunk_size + ff_qproj_out_ = ff_qkv_tensor_out[prev_size : prev_size + chunk_size, :, :] + ff_kproj_out_ = ff_qkv_tensor_out[prev_size + chunk_size : prev_size + 2*chunk_size, :, :] + ff_vproj_out_ = ff_qkv_tensor_out[prev_size + 2*chunk_size : prev_size + 3*chunk_size, :, :] + ff_qproj_out = np.concatenate((ff_qproj_out, ff_qproj_out_), axis=0) + ff_kproj_out = np.concatenate((ff_kproj_out, ff_kproj_out_), axis=0) + ff_vproj_out = np.concatenate((ff_vproj_out, ff_vproj_out_), axis=0) + compare_loaded_tensors(hf_q_proj_out.T, ff_qproj_out) + compare_loaded_tensors(hf_k_proj_out.T, ff_kproj_out) + compare_loaded_tensors(hf_v_proj_out.T, ff_vproj_out) + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + ff_attn_tensor_in = get_ff_tensor( + ff_tensor_name, + input_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + assert torch.allclose(ff_qkv_tensor_out, ff_attn_tensor_in) + + # Compared scaled qproj + hf_tensor_name = f"layers.{i}.self_attn.scaled_qproj" + input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + scaled_qproj_in = 
get_hf_tensor(hf_tensor_name, input_c) + scaled_qproj_out = get_hf_tensor(hf_tensor_name, output_c) + assert torch.allclose(scaled_qproj_in, scaled_qproj_out) + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.scaled_qkv_proj" + scaled_qkv_proj0 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0"), [64*6,3,9]) + scaled_qkv_proj1 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0").replace("shard_0", "shard_1"), [64*6,3,9]) + ff_scaled_qkv_proj = np.concatenate([scaled_qkv_proj0, scaled_qkv_proj1], axis=0) + ff_scaled_q_proj = torch.from_numpy(ff_scaled_qkv_proj[:, :1, :]).to(scaled_qproj_out.dtype) + # print("HF scaled qproj:") + # print(scaled_qproj_out.squeeze().T) + # print("FF scaled q proj:") + # print(ff_scaled_q_proj.squeeze()) + # print("HF unscaled qproj:") + # print(hf_q_proj_out.squeeze().T) + # print("FF unscaled qproj:") + # print(torch.from_numpy(ff_qproj_out.squeeze()).to(scaled_qproj_out.dtype)) + # assert torch.allclose(hf_q_proj_out.squeeze().T, ff_scaled_q_proj.squeeze()) + + + + # check that out_proj input, attn_scores out and input are identical on the hf side + hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + attn_scores_in = get_hf_tensor(hf_tensor_name, input_c) + attn_scores_out = get_hf_tensor(hf_tensor_name, output_c) + hf_tensor_name = f"layers.{i}.self_attn.out_proj" + out_proj_in = get_hf_tensor(hf_tensor_name, input_c) + assert torch.allclose(attn_scores_in, attn_scores_out) + assert torch.allclose(attn_scores_in, out_proj_in) + + # Compare out proj input. 
This should be the output of the attention without any bias involved + hf_tensor_name = f"layers.{i}.self_attn.out_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + compare(hf_tensor, ff_tensor, label=f"Attention o-proj {i} input") + + hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # hf_tensor_name = f"layers.{i}.final_layer_norm" + # ff_tensor_name = f"layers.{i}.layers.{i}.add_bias_residual_layer_norm" + # output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + # compare(hf_tensor, ff_tensor, label=f"Add Bias Residula LN {i} output 0") + + hf_tensor_name = f"layers.{i}.self_attn.out_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name.replace(".out_proj", ".o_proj")) + # # the raw attention result, w/o o_proj. 
This is the output of senf_attn of FF and the input of o_proj in HF + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # # TP for self-attn partitions the attention heads across TP workers + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + # compare(hf_tensor, ff_tensor, label=f"Attention oproj {i} output") + + # hf_tensor_name = f"layers.{i}.self_attn.out_proj" + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + # output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + # print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + # compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + + + # # Post-attention layernorm + # hf_tensor_name = f"layers.{i}.add_bias_residual_layer_norm" + # ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + # output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + # compare(hf_tensor, ff_tensor, label=f"Add bias residual layernorm {i} output") + + # FC1 (+ ReLU) + hf_tensor_name = f"layers.{i}.activation_fn" + ff_tensor_name = convert_hf_filename_to_ff(f"layers.{i}.fc1") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"FC1 {i} output") + + # FC2 + hf_tensor_name = f"layers.{i}.fc2" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"FC2 {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # compare(hf_tensor, ff_tensor, label=f"FC2 {i} output") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # Norm + hf_tensor_name = "final_layer_norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", 
ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Final layer norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + +parser = argparse.ArgumentParser(description='Argument Parser Example') +# Adding arguments +parser.add_argument('-m', '--model-name', type=str, default="goliaro/llama-160m-lora", help='Name of the model') +parser.add_argument('-n', '--num-steps', type=int, default=1, help='Number of decoding steps') +parser.add_argument('-tp', '--tensor-parallelism-degree', type=int, default=1, help='The tensor parallelism degree used when running FlexFlow') + +# Parse the arguments from command line +args = parser.parse_args() + +if __name__ == "__main__": + hf_config = AutoConfig.from_pretrained(args.model_name) + alignment_class = None + if hf_config.architectures[0] == "LlamaForCausalLM": + alignment_class = LllamaAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) + elif hf_config.architectures[0] == "OPTForCausalLM": + alignment_class = OPTAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) + + # alignment_class.check_weights_alignment() + for i in range(args.num_steps): + alignment_class.check_fwd_pass(i) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index 93727bdc89..3085bbda56 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -3,6 +3,8 @@ from typing import List from enum import Enum from dataclasses import dataclass +import warnings + abs_dirname = os.path.dirname(os.path.abspath(__file__)) cache_folder = os.path.expanduser(os.getenv("FF_CACHE_PATH", "~/.cache/flexflow")) @@ -472,7 +474,16 @@ def replace_value(lst, old_value, new_value): if occurrences == 0: raise ValueError(f"Value {old_value} not found in the list.") elif occurrences > 1: - raise ValueError(f"Multiple instances of {old_value} found in the list.") + warnings.warn(f"Multiple instances of {old_value} found in the list.") + occurrence_idx=0 + for i, value in enumerate(lst): + if value == old_value: + occurrence_idx += 1 + if occurrence_idx == 2: + lst[i] = new_value + break + return lst + # raise ValueError(f"Multiple instances of {old_value} found in the list.") else: index = lst.index(old_value) lst[index] = new_value diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 16b46cfa81..a2fc5548ab 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -77,7 +77,7 @@ def main(): if args.save_peft_tensors: make_debug_dirs() register_peft_hooks(model) - save_peft_weights(model, target_modules=["lora", "lm_head", 
"down_proj"]) + save_model_weights(model, target_modules=["lora", "lm_head", "down_proj"]) # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py index 9332c803b2..94fb96f029 100644 --- a/tests/peft/hf_utils.py +++ b/tests/peft/hf_utils.py @@ -40,7 +40,7 @@ def get_dst_folder(subdir, step_idx=0): def simplify_name(name): - return name.replace("base_model.model.model.", "").replace("base_model.model.", "") + return name.replace("base_model.model.model.", "").replace("base_model.model.", "").replace("model.layers.", "layers.").replace("model.", "").replace("decoder.", "") def get_optim_type(args): @@ -114,7 +114,7 @@ def peft_backward_hook(module, grad_input, grad_output): module.bwd_step += 1 -def peft_forward_hook(module, input, output): +def fwd_hook(module, input, output): if len(input) == 0 or len(output) == 0: return assert module.name is not None and module.fwd_step is not None @@ -312,11 +312,18 @@ def register_peft_hooks(model): layer.bwd_step = 0 if verbose: print(f"Adding hooks to layer {layer.name}") - layer.register_forward_hook(peft_forward_hook) + layer.register_forward_hook(fwd_hook) layer.register_full_backward_hook(peft_backward_hook) +def register_inference_hooks(model): + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + if verbose: + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(fwd_hook) -def save_peft_weights(model, target_modules=[]): +def save_model_weights(model, target_modules=[]): # Save any weights of interest for name, params in model.named_parameters(): simplified_name = simplify_name(name) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index 266bb64137..cc677cd51a 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -98,14 +98,14 @@ def get_tp_partition_dim(ff_weight_name) -> int: # 1. get shape of hf weight hf_weight = torch.load(hf_w_path, map_location='cpu') - hf_weigth_shape = hf_weight.shape + hf_weight_shape = hf_weight.shape ff_partition_dim = get_tp_partition_dim(ff_weight_name) - ff_weigth_shape = list(hf_weigth_shape)[::-1] + ff_weight_shape = list(hf_weight_shape)[::-1] if ff_partition_dim >= 0: - ff_weigth_shape[ff_partition_dim] //= self.tp_degree + ff_weight_shape[ff_partition_dim] //= self.tp_degree # 2. 
handle flexflow shards in case of tensor parallelism - ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weigth_shape) for tp_idx in range(self.tp_degree)] + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weight_shape) for tp_idx in range(self.tp_degree)] if self.tp_degree > 1: if ff_partition_dim >= 0: ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) @@ -149,6 +149,7 @@ def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): if not os.path.isfile(hf_tensor_path): raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) hf_tensor = torch.load(hf_tensor_path, map_location='cpu') if hf_tensor_name == "embed_tokens": self.num_tokens = hf_tensor.shape[1] @@ -162,6 +163,7 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp if not os.path.isfile(ff_tensor_path): raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + print("loading ff tensor: ", ff_tensor_filename) ff_shape = list(hf_shape)[::-1] if tp_type == TPType.PARTITION: ff_shape[0] //= self.tp_degree @@ -206,8 +208,10 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance print(f"Error in comparison {label}:\n{e}\n") print("HF tensor:") print(hf_tensor.squeeze()) + print(hf_tensor.shape) print("FF tensor:") print(ff_tensor.squeeze()) + print(ff_tensor.shape) raise e print(f"-- FWD pass {step_idx}--") @@ -245,9 +249,13 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # Attention hf_tensor_name = f"layers.{i}.self_attn.o_proj" ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) - output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # the raw attention result, w/o o_proj. 
This is the output of senf_attn of FF and the input of o_proj in HF + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # TP for self-attn partitions the attention heads across TP workers + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) compare(hf_tensor, ff_tensor, label=f"Attention {i} output") # Post-attention layernorm @@ -365,6 +373,7 @@ def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): if not os.path.isfile(hf_tensor_path): raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) hf_tensor = torch.load(hf_tensor_path, map_location='cpu') return hf_tensor @@ -378,6 +387,7 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") if not os.path.isfile(ff_tensor_path): raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + print("loading ff tensor: ", ff_tensor_filename) ff_shape = list(hf_shape)[::-1] if tp_type == TPType.PARTITION: @@ -392,8 +402,10 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp tensor_comparison_idx.ff_tensor_type == "output_gradient" or tensor_comparison_idx.ff_tensor_type == "input_gradient" ) - ) + ) and + not ff_tensor_name.endswith(".self_attn.qkv_proj") ) + print(ff_tensor_filename + (" is not truncated" if intermediate_attention_tensor else " is truncated")) if not intermediate_attention_tensor: ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) @@ -432,8 +444,10 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance print(f"Error in comparison {label}:\n{e}\n") print("HF tensor:") print(hf_tensor.squeeze()) + print(hf_tensor.shape) print("FF tensor:") print(ff_tensor.squeeze()) + print(ff_tensor.shape) raise e print(f"-- BWD pass {step_idx}--") @@ -533,11 +547,12 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # Attn O-proj hf_tensor_name = f"layers.{i}.self_attn.o_proj" - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) - hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) - compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = 
get_hf_tensor(hf_tensor_name, input_comparison) @@ -579,7 +594,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # FF Attn input with HF layernorm out hf_tensor_name = f"layers.{i}.input_layernorm" - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.qkv_proj" input_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) From 6da4f4ad0cb20cbc54da9acb9d736fdbb34a082e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 04:41:28 +0000 Subject: [PATCH 30/44] Add support for max_new_tokens parameter --- include/flexflow/batch_config.h | 4 +- include/flexflow/flexflow_c.h | 3 +- include/flexflow/request_manager.h | 3 +- inference/incr_decoding/incr_decoding.cc | 2 +- inference/peft/peft.cc | 2 +- inference/peft/peft_bwd_benchmark.cc | 6 +- inference/peft/peft_fwd_benchmark.cc | 2 +- inference/peft/req_rate_benchmark.cc | 8 +- inference/spec_infer/spec_infer.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 59 +++++------ python/flexflow/serve/serve.py | 11 ++- src/c/flexflow_c.cc | 32 ++++-- src/ops/add_bias_residual_layer_norm.cpp | 2 +- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/kernels/linear_kernels.cpp | 2 +- src/ops/kernels/linear_kernels.cu | 2 +- src/ops/kernels/lora_linear_kernels.cpp | 2 +- src/ops/kernels/lora_linear_kernels.cu | 2 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 2 +- src/ops/kernels/residual_rms_norm_kernels.cu | 2 +- src/ops/kernels/rms_norm_kernels.cpp | 2 +- src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/layer_norm.cpp | 2 +- src/ops/layer_norm.cu | 2 +- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/runtime/batch_config.cc | 4 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 97 ++++++++++++------- src/runtime/tree_verify_batch_config.cc | 4 +- 32 files changed, 164 insertions(+), 111 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 873fed0bdb..a509af765c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -87,7 +87,7 @@ class BatchConfig { first_token_depth_in_request = 0; first_token_offset_in_batch = 0; num_tokens_in_batch = 0; - max_sequence_length = 0; + max_length = 0; request_guid = 0; prompt_phase = false; batch_config_request_id = -1; @@ -98,7 +98,7 @@ class BatchConfig { int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; - int max_sequence_length; + int max_length; // request id in batch config: int batch_config_request_id = -1; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index c1e18e660b..52f67d8efb 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -651,7 +651,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f0fab957ee..36a56012fc 
100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -67,7 +67,8 @@ struct Request { }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; - int max_sequence_length = 128; + int max_length = -1; + int max_new_tokens = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c9ffff5c07..f8e16f24fa 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -271,7 +271,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; requests.push_back(inference_req); total_num_requests++; } diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index c55f2c0bfd..ee5bd1b460 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 86d6d8cbbf..df9a1e35db 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -308,7 +308,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -317,7 +317,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -361,7 +361,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = lengths[i]; - fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.max_length = lengths[i]; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 9ff042c157..9b020f5954 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -333,7 +333,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index 43008e74fe..cde3b1c02e 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -369,7 +369,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -379,7 +379,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -443,7 +443,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1000000000; @@ -473,7 +473,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? 
           *peft_model_id : PEFTModelID::NO_ID;
       requests.push_back(inference_req);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 9689080825..134ae70c4a 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -421,7 +421,7 @@ void FlexFlow::top_level_task(Task const *task,
       // Add inference request
       Request inference_req;
       inference_req.prompt = text;
-      inference_req.max_sequence_length = 128;
+      inference_req.max_length = 128;
       requests.push_back(inference_req);
       total_num_requests++;
     }
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index a5aadc270e..9b35b249d9 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -38,9 +38,10 @@
 )
 from flexflow.config import *
 from .flexflowlib import ffi, flexflow_library
-from typing import Union, List
+from typing import Union, List, Optional
+from dataclasses import dataclass
 from peft import LoraConfig
-import json
+import json, math
 from dataclasses import dataclass
@@ -2050,25 +2051,16 @@ def no_id_handle():
 # Request
 # -----------------------------------------------------------------------
-
+@dataclass
 class Request:
     """A class to record the metadata of an inference or finetuning request."""
-
-    def __init__(
-        self,
-        req_type: RequestType,
-        prompt: str = None,
-        max_sequence_length: int = 128,
-        peft_model_id: PEFTModelID = None,
-        dataset_filepath: str = None,
-        max_training_steps: int = 1,
-    ):
-        self.req_type = req_type
-        self.prompt = prompt
-        self.max_sequence_length = max_sequence_length
-        self.peft_model_id = peft_model_id
-        self.dataset_filepath = dataset_filepath
-        self.max_training_steps = max_training_steps
+    req_type: RequestType
+    prompt: Optional[str] = None
+    max_length: int = -1
+    max_new_tokens: int = 128
+    peft_model_id: Optional[PEFTModelID] = None
+    dataset_filepath: Optional[str] = None
+    max_training_steps: int = 1
 # -----------------------------------------------------------------------
@@ -4658,19 +4650,23 @@ def get_output_tensor(self, ffmodel, data_type):
         assert ret_val == True
         return np_array
-    def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 128):
+    def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128):
+        if max_length != -1 and max_new_tokens != -1:
+            warnings.warn(f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) seem to have been set.
`max_new_tokens` will take precedence.") assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) + estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length + max_num_chars = 5 * (estimated_max_tokens + 100) c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] c_output_length_and_tokens = [ - ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list + ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list ] c_request_types = [ enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list ] - max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + max_lengths = [max_length for prompt in prompt_list] + max_new_tokens_ = [max_new_tokens for prompt in prompt_list] peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] @@ -4682,7 +4678,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, @@ -4719,9 +4716,16 @@ def generate(self, requests_list: List[Request]): c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] - max_sequence_lengths = [ - request.max_sequence_length for request in requests_list + max_lengths = [ + request.max_length for request in requests_list ] + max_new_tokens_ = [ + request.max_new_tokens for request in requests_list + ] + for i in range(len(requests_list)): + if max_lengths[i] != -1 and max_new_tokens_[i] != -1: + warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. 
`max_new_tokens` will take precedence.") + peft_model_ids = [ ( request.peft_model_id @@ -4745,7 +4749,8 @@ def generate(self, requests_list: List[Request]): c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 132c50995b..e3b6b47466 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -498,12 +498,17 @@ def compile( def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], - max_length: int = 128, + max_length: int = -1, + max_new_tokens: int = 128, ): """Generate tokens based on the input prompt(s) :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests :type requests_or_prompts: Union[str, List[str], Request, List[Request]] + :param max_length: The maximum length in tokens of the prompt + generated sequence, defaults to -1 (no maximum length) + :type max_length: int, optional + :param max_new_tokens: The maximum number of new tokens (excluding the prompt) to generate, defaults to 128 + :type max_new_tokens: int, optional :return: the generation results :rtype: GenerationResult """ @@ -511,7 +516,7 @@ def generate( if len(requests_or_prompts) == 0: return None return self.model.ffmodel.generate_inf_only( - [requests_or_prompts], max_length + [requests_or_prompts], max_length, max_new_tokens ) elif type(requests_or_prompts) == Request: return self.model.ffmodel.generate(requests_or_prompts) @@ -520,7 +525,7 @@ def generate( return [] if type(requests_or_prompts[0]) == str: return self.model.ffmodel.generate_inf_only( - requests_or_prompts, max_length + requests_or_prompts, max_length, max_new_tokens ) else: print(requests_or_prompts) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index c6cf656ac0..bfa60a6d54 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1683,7 +1683,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1698,21 +1699,24 @@ void flexflow_model_generate(flexflow_model_t handle_, std::string const text_str(input_texts[i]); Request inference_req; inference_req.prompt = text_str; - inference_req.max_sequence_length = max_seq_lengths[i]; + inference_req.max_length = max_lengths[i]; + inference_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", i, handle, text_str.c_str(), - max_seq_lengths[i]); + max_lengths[i], + max_new_tokens_[i]); } else if (request_types[i] == RequestType::REQ_FINETUNING) { Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + fine_tuning_req.max_length = max_lengths[i]; + fine_tuning_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; @@ -1721,11 +1725,12 @@ void 
flexflow_model_generate(flexflow_model_t handle_,
       fine_tuning_req.dataset_filepath = dataset_fp;
       fine_tuning_req.max_training_steps = training_steps[i];
       requests.push_back(fine_tuning_req);
-      DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i",
+      DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i",
                   i,
                   handle,
                   dataset_fp.c_str(),
-                  max_seq_lengths[i],
+                  max_lengths[i],
+                  max_new_tokens_[i],
                   training_steps[i]);
     } else {
       assert(false && "Unknown request type");
     }
@@ -1739,8 +1744,17 @@ void flexflow_model_generate(flexflow_model_t handle_,
     // If the prompt exceeds max seq len, check that we return the prompt with
     // no additional token. Otherwise, check that the output does not exceed
     // the max sequence length.
-    assert(results[i].output_tokens.size() <= max_seq_lengths[i] ||
-           results[i].output_tokens.size() == results[i].input_tokens.size());
+    int total_tokens = results[i].output_tokens.size();
+    int num_output_tokens = total_tokens - results[i].input_tokens.size();
+    if (max_new_tokens_[i] >= 0) {
+      assert(num_output_tokens <= max_new_tokens_[i]);
+    }
+    if (max_lengths[i] >= 0) {
+      assert(total_tokens <= max_lengths[i] || num_output_tokens == 0);
+    }
+    // assert(results[i].output_tokens.size() <= max_seq_lengths[i] ||
+    //        results[i].output_tokens.size() ==
+    //        results[i].input_tokens.size());
     output_length_and_tokens[i][0] = results[i].output_tokens.size();
     std::copy(results[i].output_tokens.begin(),
               results[i].output_tokens.end(),
diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp
index 681f55c998..cb140e0c75 100644
--- a/src/ops/add_bias_residual_layer_norm.cpp
+++ b/src/ops/add_bias_residual_layer_norm.cpp
@@ -224,7 +224,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper(
       continue;
     }
     int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_sequence_length;
+    int max_peft_tokens = bc->requestsInfo[i].max_length;
     int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
     int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1;
     if (bc->requestsInfo[i].peft_bwd) {
diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu
index bcca1ba2c6..2d2707f10b 100644
--- a/src/ops/add_bias_residual_layer_norm.cu
+++ b/src/ops/add_bias_residual_layer_norm.cu
@@ -222,7 +222,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper(
       continue;
     }
     int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_sequence_length;
+    int max_peft_tokens = bc->requestsInfo[i].max_length;
     int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
     int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1;
     if (bc->requestsInfo[i].peft_bwd) {
diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp
index a36d6719c9..6b371b840e 100644
--- a/src/ops/kernels/linear_kernels.cpp
+++ b/src/ops/kernels/linear_kernels.cpp
@@ -238,7 +238,7 @@ void inference_kernel_wrapper(LinearMeta *m,
       continue;
     }
     int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_sequence_length;
+    int max_peft_tokens = bc->requestsInfo[i].max_length;
     int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch;
     if (bc->requestsInfo[i].peft_bwd) {
       size_t activation_size_needed =
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index 3835d258e0..3832428c64 100644
--- a/src/ops/kernels/linear_kernels.cu
+++
b/src/ops/kernels/linear_kernels.cu @@ -239,7 +239,7 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { size_t activation_size_needed = diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp index c3c2cce3cf..eab8899167 100644 --- a/src/ops/kernels/lora_linear_kernels.cpp +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -249,7 +249,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 5f130782aa..93e5820f9c 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -248,7 +248,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 016364edfd..cbdb8ee153 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -273,7 +273,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 0d44f0260a..285a5a5b8f 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -270,7 +270,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 4158628005..551cb72022 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -227,7 +227,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = 
bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index dd6ada864d..8f59d65ea7 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -225,7 +225,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 27d314e21e..2fe4a85905 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -256,7 +256,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 0801d11617..b08b23819c 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -255,7 +255,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index ed973b4f71..57c9ee1418 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -283,7 +283,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 50c81d2099..c4f5866c2f 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -281,7 +281,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index ceaa1a7788..50a358beab 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -130,7 +130,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( 
continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 929d557a17..ca0168a59d 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -129,7 +129,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4c339750c7..a4bf960a2c 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -162,8 +162,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " BatchConfig Req ID: " << bc.requestsInfo[i].batch_config_request_id << std::endl; os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index b10f8e82ab..83e4390993 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -141,8 +141,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; os << " Beam Search Specific: " << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31a32dd3c8..44b181fcb3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -54,7 +54,8 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { os << "Request {\n"; os << " guid: " << req.guid << "\n"; os << " peft_model_id: " << req.peft_model_id << "\n"; - os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " max_length: " << req.max_length << "\n"; + os << " max_new_tokens: " << req.max_new_tokens << "\n"; os << " initial_len: " << req.initial_len << "\n"; os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; os << " llm_cache_size: " << req.llm_cache_size << "\n"; @@ -261,24 +262,45 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1 && request.max_new_tokens != -1) { + 
std::cout + << "Both `max_new_tokens` (=" << request.max_new_tokens + << ") and `max_length`(=" << request.max_length + << ") seem to have been set. `max_new_tokens` will take precedence."; + } request.peft_model_id = request_.peft_model_id; request.warmup = request_.warmup; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens < get_max_sequence_length()); + assert(request_.benchmarking_tokens < get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; request.tokens.insert(request.tokens.end(), request_.benchmarking_tokens, 15); // insert random number } else { std::vector tokens = this->tokenizer_->Encode(request_.prompt); + // from here on, we will only use the max_length parameter + if (request.max_new_tokens != -1) { + request.max_length = tokens.size() + request.max_new_tokens; + } + // check that max sequence length is not exceeded + // 1. prompt itself should be less than max sequence length if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + std::cout << "Error: prompt (" << tokens.size() + << " tokens) exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; + return INVALID_GUID; + } + // 2. max_length should not exceed the max_sequence_length + if (request.max_length >= get_max_sequence_length()) { + std::cout << "Error: max_length (" << request.max_length + << ") exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; return INVALID_GUID; } for (int i = 0; i < tokens.size(); i++) { @@ -341,7 +363,18 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.initial_len = 0; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1) { + std::cout << "Warning: max_length is set for PEFT finetuning, but it will " + "be ignored." + << std::endl; + } + if (request.max_new_tokens != -1) { + std::cout << "Warning: max_new_tokens is set for PEFT finetuning, but " + "it will be ignored." 
+ << std::endl; + } request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; @@ -352,7 +385,8 @@ RequestManager::RequestGuid // Load dataset if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens <= get_max_sequence_length()); + assert(request_.benchmarking_tokens <= get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; std::vector input_tokens; std::vector output_tokens; @@ -385,9 +419,10 @@ RequestManager::RequestGuid this->tokenizer_->Encode(output_text); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { - std::cout << "Warning: too many tokens in sample, only load up to " - << get_max_sequence_length() << " tokens, but got " - << input_tokens.size() + output_tokens.size() << ".\n"; + std::cout << "Error: sample in training dataset is " + << input_tokens.size() + output_tokens.size() + << " tokens long, exceeding the maximum sequence length of " + << get_max_sequence_length() << " tokens.\n"; return INVALID_GUID; } else { request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); @@ -515,7 +550,7 @@ bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + if (request.tokens.size() >= old_bc.requestsInfo[i].max_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -698,8 +733,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == @@ -765,8 +799,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -932,8 +965,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_active_infr_tokens(); new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[inference_batch_size].max_sequence_length = - request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; @@ -1076,10 +1108,10 @@ BeamSearchBatchConfig verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { + request.max_length) { // Append all 
verified tokens to the request for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { + if (token_pair.second < request.max_length) { request.tokens.push_back(token_pair.first); } } @@ -1171,14 +1203,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = - new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].max_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; @@ -1254,8 +1285,7 @@ BeamSearchBatchConfig request.ssm_cache_size; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; @@ -1307,8 +1337,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -1484,8 +1513,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; profiling_requests[request.guid].ssm_decoding_steps += 1; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1613,8 +1641,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1816,8 +1843,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = 
i; // copy bitmask to verify batchconfig @@ -1958,8 +1985,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index a71b1070b2..f8ac6089fe 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -58,8 +58,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } From dbd4cf170a6cf47d7d471db50f60d11db2fcb58f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 10 Oct 2024 03:52:52 +0000 Subject: [PATCH 31/44] fix --- src/ops/inc_multihead_self_attention.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 2802dd41b6..454926bcdb 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -126,7 +126,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { From 2bfa56cea6fe63837b4a1e3b9ee9737236fe73a8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 18 Oct 2024 22:56:03 -0400 Subject: [PATCH 32/44] Update LLAMA tokenizer (#1524) * fix tokenizer conversion * update * update * update * fix * fix * lint * simplify api * fix * fix * fix * update to 12.1 (#1512) * fix deadlock? 
* remove barrier where not strictly needed --------- Co-authored-by: zhihao --- .github/workflows/gpu-ci.yml | 8 +- cmake/nccl.cmake | 11 +- docker/flexflow-environment/Dockerfile | 24 ++-- .../ops/kernels/lora_linear_kernels.h | 11 +- include/flexflow/optimizer.h | 11 +- include/flexflow/request_manager.h | 3 +- inference/peft/peft.cc | 2 +- inference/python/ff_peft.py | 3 +- inference/python/incr_decoding.py | 17 ++- inference/python/spec_infer.py | 24 +++- python/flexflow/core/flexflow_cffi.py | 123 +++++++++++++----- python/flexflow/serve/serve.py | 100 +++++++++++--- src/ops/fused.cc | 2 - src/ops/fused.cpp | 11 +- src/ops/fused.cu | 11 +- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/kernels/lora_linear_kernels.cu | 20 ++- src/ops/lora_linear.cc | 3 +- src/ops/spec_inc_multihead_self_attention.cc | 6 +- src/ops/tree_inc_multihead_self_attention.cc | 6 +- src/parallel_ops/allreduce.cc | 4 + src/parallel_ops/parallel_identity.cc | 5 +- src/runtime/model.cc | 30 +++-- src/runtime/optimizer.cc | 5 +- src/runtime/optimizer_kernel.cpp | 12 +- src/runtime/optimizer_kernel.cu | 12 +- src/runtime/request_manager.cc | 66 ++++++---- .../python_test_configs/generate_configs.py | 1 + 28 files changed, 378 insertions(+), 155 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 6ca50027d1..9ee4693f91 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -56,7 +56,7 @@ jobs: CONDA: "3" needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Keep alive @@ -75,7 +75,7 @@ jobs: CONDA: "3" needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -151,7 +151,7 @@ jobs: HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -239,7 +239,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 82cf3b4122..abb4864588 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -36,11 +36,12 @@ if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES}) string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2}) set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}") - if(NCCL_VERSION VERSION_LESS 2.23) - set(NCCL_OLD TRUE) - else() - set(NCCL_OLD FALSE) - endif() + set(NCCL_OLD FALSE) + # if(NCCL_VERSION VERSION_LESS 2.23) + # set(NCCL_OLD TRUE) + # else() + # set(NCCL_OLD FALSE) + # endif() message(STATUS "Found NCCL version: ${NCCL_VERSION}") else() message(WARNING "NCCL header not found, unable to determine version") diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index ee13a07375..7028fc4b2e 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -55,18 +55,18 @@ ENV 
CUDA_DIR /usr/local/cuda ARG FF_GPU_BACKEND "cuda" # Update NCCL if FF_GPU_BACKEND is cuda -RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \ - ubuntu_version=$(lsb_release -rs); \ - ubuntu_version=${ubuntu_version//./}; \ - wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ - DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ - DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ - rm -f cuda-keyring_1.0-1_all.deb; \ - DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ - else \ - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ - fi' +# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ +# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \ +# ubuntu_version=$(lsb_release -rs); \ +# ubuntu_version=${ubuntu_version//./}; \ +# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ +# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ +# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ +# rm -f cuda-keyring_1.0-1_all.deb; \ +# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ +# else \ +# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ +# fi' # Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 5360b5f8ea..eee9875d30 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -8,7 +8,8 @@ #include "flexflow/ops/lora_linear.h" namespace FlexFlow { - +using Legion::Context; +using Legion::Runtime; struct LoraLinearWeight { // weights void *w0_ptr, *w1_ptr; @@ -46,7 +47,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void peft_bwd_kernel_wrapper(LoraLinearMeta *m, +void peft_bwd_kernel_wrapper(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); @@ -63,7 +66,9 @@ void inference_kernel(LoraLinearMeta *m, int out_dim, ffStream_t stream); template -void peft_bwd_kernel(LoraLinearMeta *m, +void peft_bwd_kernel(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, DT *input_grad_ptr, DT const *output_grad_ptr, diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4ed..4917df73c3 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -20,7 +20,8 @@ #include "legion.h" namespace FlexFlow { - +using Legion::Context; +using Legion::Runtime; class FFModel; class OpMeta; @@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(SGDOptimizer const *op, + static void nccl_update_task_gpu(Context ctx, + Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer { std::vector 
const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(AdamOptimizer const *op, + static void nccl_update_task_gpu(Context ctx, + Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 36a56012fc..94bfc74244 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -68,7 +68,7 @@ struct Request { BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_length = -1; - int max_new_tokens = 128; + int max_new_tokens = -1; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; @@ -302,6 +302,7 @@ class RequestManager { ModelType model_type; int bos_token_id; int eos_token_id; + bool old_llama_tokenizer = false; std::string output_filepath; std::queue pending_infr_request_queue; std::queue pending_peft_request_queue; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index ee5bd1b460..14fc653eba 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_length = 128; + inference_req.max_new_tokens = 128; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index a7d38a66b6..13da7aee20 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -162,7 +162,7 @@ def main(): ff.Request( ff.RequestType.REQ_INFERENCE, prompt=prompt, - max_sequence_length=128, + max_new_tokens=128, peft_model_id=llm.get_ff_peft_id(lora_inference_config), ) for prompt in prompts @@ -172,7 +172,6 @@ def main(): if len(configs.finetuning_dataset) > 0: finetuning_request = ff.Request( ff.RequestType.REQ_FINETUNING, - max_sequence_length=128, peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), dataset_filepath=configs.finetuning_dataset, max_training_steps=2, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 1df5a05a8f..232ef1699c 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,12 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 8 * 1024, # 8GB + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, "enable_peft": False, - "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -71,6 +71,7 @@ def get_configs(): "full_precision": False, "prompt": "", "output_file": "", + "max_length": 128, } # Merge dictionaries ff_init_configs.update(llm_configs) @@ -106,9 +107,9 @@ def main(): max_seq_length=256, max_tokens_per_batch=64, ) - + llm.start_server() - + if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] if "max_length" not in configs_dict: @@ -119,8 +120,10 @@ def main(): if "max_length" not in configs_dict: result = llm.generate("Three tips for staying healthy are: ") else: - result = llm.generate("Three 
tips for staying healthy are: ", max_length=configs.max_length) - + result = llm.generate( + "Three tips for staying healthy are: ", max_length=configs.max_length + ) + llm.stop_server() diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 39529abda3..7ae752cffc 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,12 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 8 * 1024, # 8GB + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, "enable_peft": False, - "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -81,6 +81,7 @@ def get_configs(): ], "prompt": "", "output_file": "", + "max_length": 128, } # Merge dictionaries ff_init_configs.update(llm_configs) @@ -144,17 +145,26 @@ def main(): max_tokens_per_batch=64, ssms=ssms, ) - + llm.start_server() if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] - results = llm.generate(prompts) + if "max_length" not in configs_dict: + results = llm.generate(prompts) + else: + results = llm.generate(prompts, max_length=configs.max_length) else: - result = llm.generate("Three tips for staying healthy are: ") - + if "max_length" not in configs_dict: + result = llm.generate("Three tips for staying healthy are: ") + else: + result = llm.generate( + "Three tips for staying healthy are: ", max_length=configs.max_length + ) + llm.stop_server() + if __name__ == "__main__": print("flexflow inference example (speculative inference)") main() diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 9b35b249d9..e2240f0b4f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1795,7 +1795,7 @@ def __init__( raise ValueError( "Target modules can only be specified when trainable=True" ) - + # Check rank, lora_alpha, lora_dropout values if rank is not None or lora_alpha is not None or lora_dropout is not None: if not trainable or not init_lora_weights: @@ -1805,7 +1805,7 @@ def __init__( rank = rank if rank is not None else 8 lora_alpha = lora_alpha if lora_alpha is not None else 8.0 lora_dropout = lora_dropout if lora_dropout is not None else 0.0 - + # If passed, check if the values of rank, lora_alpha, and lora_dropout are valid if rank < 1 or type(rank) != int: raise ValueError("Rank must be >= 1 and an integer") @@ -1813,7 +1813,7 @@ def __init__( raise ValueError("Lora_alpha must be > 0") if lora_dropout < 0 or lora_dropout > 1: raise ValueError("Lora_dropout must be in the interval [0, 1]") - + self.ff_initialized = False self._cache_folder = cache_folder self._peft_model_id = peft_model_id @@ -2051,13 +2051,15 @@ def no_id_handle(): # Request # ----------------------------------------------------------------------- + @dataclass class Request: """A class to record the metadata of an inference or finetuning request.""" + req_type: RequestType prompt: Optional[str] = None max_length: int = -1 - max_new_tokens: int = 128 + max_new_tokens: int = -1 peft_model_id: Optional[PEFTModelID] = None dataset_filepath: Optional[str] = None max_training_steps: int = 1 @@ -4650,26 +4652,65 @@ def get_output_tensor(self, ffmodel, 
data_type): assert ret_val == True return np_array - def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128): + def _estimate_max_num_tokens( + max_length: int, max_new_tokens: int, prompt: Optional[str] + ): + if prompt is None: + assert max_new_tokens == -1 + return ( + math.ceil(max_new_tokens + len(prompt.split()) * 1.5) + if max_new_tokens != -1 + else max_length + ) + + def _estimate_max_num_chars( + max_length: int, max_new_tokens: int, prompt: Optional[str] + ): + return ( + 5 * FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) + + 100 + ) + + # deprecated + def generate_inf_only( + self, + prompt_list: List[str], + max_length: int, + max_new_tokens: int, + ): if max_length != -1 and max_new_tokens != -1: - warnings.warn(f"Both `max_new_tokens` (={self.max_new_tokens}) and `max_length`(={self.max_length}) seem to have been set. `max_new_tokens` will take precedence.") + raise ValueError( + f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) seem to have been set." + ) + if max_length == -1 and max_new_tokens == -1: + raise ValueError( + f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) were left unset." + ) assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length - max_num_chars = 5 * (estimated_max_tokens + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] + c_output_texts = [ + ffi.new( + "char[]", + FFModel._estimate_max_num_chars(max_length, max_new_tokens, prompt), + ) + for prompt in prompt_list + ] c_output_length_and_tokens = [ - ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list + ffi.new( + "int[]", + FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) + + 100, + ) + for prompt in prompt_list ] c_request_types = [ - enum_to_int(RequestType, RequestType.REQ_INFERENCE) - for prompt in prompt_list + enum_to_int(RequestType, RequestType.REQ_INFERENCE) for _ in prompt_list ] - max_lengths = [max_length for prompt in prompt_list] - max_new_tokens_ = [max_new_tokens for prompt in prompt_list] - peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] - dataset_filepaths = [ffi.NULL for prompt in prompt_list] - training_steps = [0 for prompt in prompt_list] + max_lengths = [max_length for _ in prompt_list] + max_new_tokens_ = [max_new_tokens for _ in prompt_list] + peft_model_ids = [PEFTModelID.no_id_handle() for _ in prompt_list] + dataset_filepaths = [ffi.NULL for _ in prompt_list] + training_steps = [0 for _ in prompt_list] num_finetuning_losses = ffi.new("int *") c_finetuning_losses = ffi.new("float[]", 0) ffc().flexflow_model_generate( @@ -4698,34 +4739,55 @@ def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_ne def generate(self, requests_list: List[Request]): assert isinstance(requests_list, list) + for request in requests_list: + assert isinstance(request, Request) + if request.max_length != -1 and request.max_new_tokens != -1: + raise ValueError( + f"Both `max_new_tokens` (={request.max_new_tokens}) and `max_length`(={request.max_length}) seem to have been set." + ) + if request.max_length == -1 and request.max_new_tokens == -1: + raise ValueError( + f"Both `max_new_tokens` (={request.max_new_tokens}) and `max_length`(={request.max_length}) were left unset." 
+ ) + if ( + request.req_type == RequestType.REQ_FINETUNING + and request.max_new_tokens != -1 + ): + raise ValueError( + f"Finetuning requests should not have `max_new_tokens` set." + ) c_input_texts = [ get_c_name(request.prompt) for request in requests_list ] # entry will be None for finetuning requests c_output_texts = [ ( - ffi.new("char[]", 5 * (request.max_sequence_length + 100)) + ffi.new( + "char[]", + FFModel._estimate_max_num_chars( + request.max_length, request.max_new_tokens, request.prompt + ), + ) if request.req_type == RequestType.REQ_INFERENCE else ffi.NULL ) for request in requests_list ] c_output_length_and_tokens = [ - ffi.new("int[]", request.max_sequence_length + 100) + ffi.new( + "int[]", + FFModel._estimate_max_num_tokens( + request.max_length, request.max_new_tokens, request.prompt + ) + + 100, + ) for request in requests_list ] c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] - max_lengths = [ - request.max_length for request in requests_list - ] - max_new_tokens_ = [ - request.max_new_tokens for request in requests_list - ] - for i in range(len(requests_list)): - if max_lengths[i] != -1 and max_new_tokens_[i] != -1: - warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. `max_new_tokens` will take precedence.") - + max_lengths = [request.max_length for request in requests_list] + max_new_tokens_ = [request.max_new_tokens for request in requests_list] + peft_model_ids = [ ( request.peft_model_id @@ -4742,7 +4804,7 @@ def generate(self, requests_list: List[Request]): # c_finetuning_losses = ffi.new("float**") # TODO: set this value automatically c_finetuning_losses = ffi.new("float[]", 10000) - + ffc().flexflow_model_generate( self.handle, len(requests_list), @@ -4774,7 +4836,6 @@ def generate(self, requests_list: List[Request]): finetuning_losses=finetuning_losses, ) ) - return results def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e3b6b47466..c8540a6ed3 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -27,15 +27,18 @@ MPTConfig, ) from flexflow.core import * -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc from typing import Union, List +from huggingface_hub import snapshot_download class _SupportedModels: - def __init__(self,): + def __init__( + self, + ): self.supported_models = { "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), @@ -292,8 +295,8 @@ def download_peft_weights(): weights_path = get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - peft_model_id, weights_path + ff_revision, ff_revision_file, latest_revision = ( + self.__get_revision_hashes(peft_model_id, weights_path) ) if ff_revision != latest_revision: @@ -349,10 +352,25 @@ def download_hf_tokenizer_if_needed(self): print( f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
) - # Download tokenizer from HuggingFace, or load it from the local folder - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - # Save tokenizer - hf_tokenizer.save_pretrained(self.tokenizer_path) + # Load/download the tokenizer files + target_tokenizer_files = [ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + ] + if os.path.exists(self.model_name): + hf_tokenizer_path = self.model_name + else: + hf_tokenizer_path = snapshot_download( + repo_id=self.model_name, allow_patterns=target_tokenizer_files + ) + for file in target_tokenizer_files: + src_path = os.path.join(hf_tokenizer_path, file) + dst_path = os.path.join(self.tokenizer_path, file) + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: @@ -417,6 +435,8 @@ def compile( model_specific_pipeline_parallelism_degree ) + self.max_seq_length = max_seq_length + # Create request manager and set serving configuration self.rm = RequestManager() self.rm.set_max_requests_per_batch(max_requests_per_batch) @@ -495,11 +515,44 @@ def compile( atexit.register(self.rm.stop_server) + def _generate(self, requests: List[Request]): + if len(requests) == 0: + return [] + for req in requests: + if req.req_type == RequestType.REQ_INFERENCE: + # check max_length and max_new_tokens parameters + if req.max_length == -1 and req.max_new_tokens == -1: + req.max_length = self.max_seq_length -1 + elif req.max_length != -1 and req.max_new_tokens != -1: + warnings.warn( + f"Both `max_new_tokens` (={req.max_new_tokens}) and `max_length`(={req.max_length}) seem to have been set. `max_new_tokens` will take precedence." + ) + req.max_length = -1 + if ( + req.max_length >= self.max_seq_length + or req.max_new_tokens >= self.max_seq_length + ): + raise ValueError( + f"max_length ({req.max_length}) or max_new_tokens ({req.max_new_tokens}) exceeds the maximum sequence length ({self.max_seq_length})" + ) + else: + if req.max_new_tokens != -1: + raise ValueError( + f"max_new_tokens ({req.max_new_tokens}) is not allowed for finetuning requests." 
+ ) + if req.max_length == -1: + req.max_length = self.max_seq_length -1 + if req.max_length >= self.max_seq_length: + raise ValueError( + f"max_length ({req.max_length}) exceeds the maximum sequence length ({self.max_seq_length})" + ) + return self.model.ffmodel.generate(requests) + def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], max_length: int = -1, - max_new_tokens: int = 128, + max_new_tokens: int = -1, ): """Generate tokens based on the input prompt(s) @@ -514,24 +567,35 @@ def generate( """ if type(requests_or_prompts) == str: if len(requests_or_prompts) == 0: - return None - return self.model.ffmodel.generate_inf_only( - [requests_or_prompts], max_length, max_new_tokens + return [] + request = Request( + req_type=RequestType.REQ_INFERENCE, + prompt=requests_or_prompts, + max_length=max_length, + max_new_tokens=max_new_tokens, ) + return self._generate([request]) elif type(requests_or_prompts) == Request: - return self.model.ffmodel.generate(requests_or_prompts) + return self._generate([requests_or_prompts]) elif type(requests_or_prompts) == list: if len(requests_or_prompts) == 0: return [] if type(requests_or_prompts[0]) == str: - return self.model.ffmodel.generate_inf_only( - requests_or_prompts, max_length, max_new_tokens - ) + requests = [ + Request( + req_type=RequestType.REQ_INFERENCE, + prompt=req, + max_length=max_length, + max_new_tokens=max_new_tokens, + ) + for req in requests_or_prompts + ] + return self._generate(requests) else: print(requests_or_prompts) - return self.model.ffmodel.generate(requests_or_prompts) + return self._generate(requests_or_prompts) else: - assert False, "Please pass a non-empty string or list of strings" + assert False, "Please pass a string, list of strings, Request, or list of Requests" def start_server(self): self.rm.start_server(self.model.ffmodel) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 720d678a4a..984691fa66 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -476,7 +476,6 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -571,7 +570,6 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 2cede662f3..dfb524d206 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -612,8 +612,10 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::AllReduce::inference_kernel_wrapper( m, bc, my_input_accessor[0], my_output_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } case OP_PARALLEL_IDENTITY: { @@ -870,7 +872,12 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); Kernels::LoraLinear::peft_bwd_kernel_wrapper( - m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + ctx, + runtime, + m, + bc, + my_input_grad_accessor[0], + my_output_grad_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -1129,8 +1136,10 @@ __host__ void FusedOp::peft_bwd_task(Task const 
*task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } default: { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 5aed2cd69a..62845c0f8e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -623,8 +623,10 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::AllReduce::inference_kernel_wrapper( m, bc, my_input_accessor[0], my_output_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } case OP_PARALLEL_IDENTITY: { @@ -888,7 +890,12 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); Kernels::LoraLinear::peft_bwd_kernel_wrapper( - m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + ctx, + runtime, + m, + bc, + my_input_grad_accessor[0], + my_output_grad_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -1149,8 +1156,10 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } default: { diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index a4604a11a2..8818cd9673 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -147,7 +147,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 93e5820f9c..638cee8cae 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -96,7 +96,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m, } } -void peft_bwd_kernel_wrapper(LoraLinearMeta *m, +void peft_bwd_kernel_wrapper(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { @@ -111,7 +113,9 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { - Internal::peft_bwd_kernel(m, + Internal::peft_bwd_kernel(ctx, + runtime, + m, bc, input_grad.get_float_ptr(), output_grad.get_float_ptr(), @@ -119,7 +123,9 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, out_dim, stream); } else if (m->input_type[0] == DT_HALF) { 
- Internal::peft_bwd_kernel(m, + Internal::peft_bwd_kernel(ctx, + runtime, + m, bc, input_grad.get_half_ptr(), output_grad.get_half_ptr(), @@ -361,7 +367,9 @@ __global__ void sgd_update(size_t count, } template -void peft_bwd_kernel(LoraLinearMeta *m, +void peft_bwd_kernel(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, DT *input_grad_ptr, DT const *output_grad_ptr, @@ -543,13 +551,15 @@ void peft_bwd_kernel(LoraLinearMeta *m, // and sum first #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); - checkCUDA(ncclAllReduce(static_cast
(weight.w1_grad_ptr), + runtime->concurrent_task_barrier(ctx); + checkNCCL(ncclAllReduce(static_cast
(weight.w1_grad_ptr), static_cast
(weight.w1_grad_ptr), w1_num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 513147f3b7..3749cce994 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -296,7 +296,6 @@ void LoraLinear::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -1066,7 +1065,7 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + peft_bwd_kernel_wrapper(ctx, runtime, m, bc, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index aa74ecc6f5..6b2a4be507 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -170,7 +170,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( Layer const *layer, std::vector const &inputs) { - std::cout << "spec create operator: " << layer->name << "\n"; + // std::cout << "spec create operator: " << layer->name << "\n"; long long value; layer->get_int_property("embed_dim", value); int embed_dim = value; @@ -182,10 +182,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( int kdim = value; layer->get_int_property("vdim", value); int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + float dropout; + layer->get_float_property("dropout", dropout); RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); rotary_embedding_meta.apply_rotary_embedding = (bool)value; diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index ae0795ac1e..ac0011d9eb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); @@ -187,10 +188,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( int kdim = value; layer->get_int_property("vdim", value); int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + float dropout; + layer->get_float_property("dropout", dropout); RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); rotary_embedding_meta.apply_rotary_embedding = (bool)value; @@ -203,6 +204,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( 
rotary_embedding_meta.high_freq_factor); layer->get_int_property("original_max_position_embeddings", value); rotary_embedding_meta.original_max_position_embeddings = (int)value; + layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; layer->get_float_property("scaling_factor", scaling_factor); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index a4443c4066..6611a6bb1f 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -197,7 +197,9 @@ void AllReduce::forward_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); + // runtime->concurrent_task_barrier(ctx); forward_kernel_wrapper(m, input, output); + // runtime->concurrent_task_barrier(ctx); } void AllReduce::backward(FFModel const &ff) { @@ -347,7 +349,9 @@ void AllReduce::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); + // runtime->concurrent_task_barrier(ctx); inference_kernel_wrapper(m, bc, input, output); + // runtime->concurrent_task_barrier(ctx); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index 7d68036709..2f76897712 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -245,7 +245,9 @@ void ParallelIdentity::backward_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == output_grad.data_type); + // runtime->concurrent_task_barrier(ctx); backward_kernel_wrapper(m, input_grad, output_grad); + // runtime->concurrent_task_barrier(ctx); } void ParallelIdentity::init_inference( @@ -270,7 +272,6 @@ void ParallelIdentity::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -422,7 +423,9 @@ void ParallelIdentity::peft_bwd_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == output_grad.data_type); + // runtime->concurrent_task_barrier(ctx); peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + // runtime->concurrent_task_barrier(ctx); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 69fe3b598d..417cd2c056 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1677,6 +1677,7 @@ void FFModel::finish_nccl_comms() { false /*must*/, 0 /*mapper_id*/, comm.first); + index_launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); } @@ -6899,7 +6900,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear Init Task"); @@ -6932,6 +6932,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); 
registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear PEFT Backward Task"); @@ -6963,7 +6964,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Init Task"); @@ -6979,6 +6979,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6995,6 +6996,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp PEFT Backward Task"); @@ -7011,6 +7013,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -7026,6 +7029,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -7262,7 +7266,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce init Task"); @@ -7280,6 +7283,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, // AllReduce forward and backward must run concurrently since they // use ncclAllReduce internally registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -7294,9 +7298,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - // AllReduce forward and backward must run concurrently since they - // use ncclAllReduce internally - // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -7315,6 +7316,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, // AllReduce forward and backward must run concurrently since they // use ncclAllReduce internally registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -7330,9 +7332,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); 
registrar.set_leaf(); - // AllReduce forward and backward must run concurrently since they - // use ncclAllReduce internally - // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce PEFT Backward Task"); @@ -7349,7 +7348,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity init Task"); @@ -7382,6 +7380,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity Backward Task"); @@ -7415,6 +7414,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity PEFT Backward Task"); @@ -7433,6 +7433,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedParallel Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedParallel Forward Task"); @@ -7448,6 +7450,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedParallel Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedParallel Backward Task"); @@ -7496,6 +7500,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task", 111 /*variant ID*/); @@ -7511,6 +7516,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ADAM_UPD_NCCL_TASK_ID, "Adam NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "Adam NCCL Update Task", 111 /*variant ID*/); @@ -7648,6 +7655,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task", 111 /*variant ID*/); @@ -7664,6 +7672,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Finish Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Finish Communicators Task", 111 /*variant ID*/); diff --git a/src/runtime/optimizer.cc 
b/src/runtime/optimizer.cc index c42a0c9aa6..96b735803c 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -311,7 +311,7 @@ void SGDOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr); + nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr); } #endif @@ -603,7 +603,8 @@ void AdamOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); + nccl_update_task_gpu( + ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 59efaf5256..9b0d3c8892 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -86,7 +86,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -96,6 +98,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -103,6 +106,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // Step 2: SGD update @@ -208,7 +212,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -218,6 +224,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -225,6 +232,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index df37e3b135..72ee74940f 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -75,7 +75,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -85,6 +87,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, 
(float *)w_grad_ptr, size, @@ -92,6 +95,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -183,7 +187,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -193,6 +199,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -200,6 +207,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 44b181fcb3..5fbee65e6d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -186,28 +186,35 @@ void RequestManager::register_tokenizer(ModelType type, std::filesystem::path tokenizer_folder(path); if (model_type == ModelType::LLAMA) { - std::filesystem::path tokenizer_model_path; + // try with tokenizer.json first + std::filesystem::path tokenizer_json_path; if (std::filesystem::is_directory(tokenizer_folder)) { - tokenizer_model_path = - std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + tokenizer_json_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.json"; } else { - tokenizer_model_path = tokenizer_folder; + tokenizer_json_path = tokenizer_folder; } - if (std::filesystem::exists(tokenizer_model_path)) { - // load from tokenizer.model - this->tokenizer_ = Tokenizer::FromBlobSentencePiece( - LoadBytesFromFile(tokenizer_model_path.string())); - } else { + if (std::filesystem::exists(tokenizer_json_path)) { // load from tokenizer.json - std::filesystem::path tokenizer_json_path = - tokenizer_folder / "tokenizer.json"; - if (!std::filesystem::exists(tokenizer_json_path)) { - std::cerr << "Failed to open file: " << tokenizer_json_path + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } else { + // load from tokenizer.model + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (!std::filesystem::exists(tokenizer_model_path)) { + std::cerr << "Failed to open file: " << tokenizer_model_path << std::endl; assert(false); } - this->tokenizer_ = Tokenizer::FromBlobJSON( - LoadBytesFromFile(tokenizer_json_path.string())); + old_llama_tokenizer = true; + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); } } else if (model_type == ModelType::OPT) { std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; @@ -264,7 +271,13 @@ RequestManager::RequestGuid request.guid = 
next_available_guid++; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; + // both unset + if (request.max_length == -1 && request.max_new_tokens == -1) { + request.max_length = get_max_sequence_length() - 1; + } + // both set if (request.max_length != -1 && request.max_new_tokens != -1) { + request.max_length = -1; std::cout << "Both `max_new_tokens` (=" << request.max_new_tokens << ") and `max_length`(=" << request.max_length @@ -365,15 +378,14 @@ RequestManager::RequestGuid request.initial_len = 0; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; - if (request.max_length != -1) { - std::cout << "Warning: max_length is set for PEFT finetuning, but it will " - "be ignored." - << std::endl; - } if (request.max_new_tokens != -1) { - std::cout << "Warning: max_new_tokens is set for PEFT finetuning, but " - "it will be ignored." - << std::endl; + std::cerr + << "Error: max_new_tokens is not allowed for PEFT finetuning requests" + << std::endl; + assert(false); + } + if (request.max_length == -1) { + request.max_length = get_max_sequence_length() - 1; } request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING; @@ -660,7 +672,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1121,7 +1133,7 @@ BeamSearchBatchConfig std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1264,7 +1276,7 @@ BeamSearchBatchConfig std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1312,7 +1324,7 @@ BeamSearchBatchConfig std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically removes // the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 0a745c7984..2720304d4f 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -34,6 +34,7 @@ "full_precision": True, "prompt": "", "output_file": "", + "max_length": 128, } ssm_configs = { "ssms": [ From d8355cae0197f35425f3c4164fdcdb23717ea293 Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:26:17 +0000 Subject: [PATCH 33/44] docker fix --- docker/build.sh | 21 +++++- docker/flexflow-environment/Dockerfile | 15 +++-- .../flexflow-environment/install_pytorch.sh | 67 +++++++++++++++++++ 3 files changed, 97 insertions(+), 6 deletions(-) create mode 100755 
docker/flexflow-environment/install_pytorch.sh diff --git a/docker/build.sh b/docker/build.sh index b68860712f..3b7a6992df 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -1,5 +1,6 @@ #! /usr/bin/env bash set -euo pipefail +set -x # Usage: ./build.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version @@ -102,7 +103,16 @@ if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then exit 0 fi -docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" --build-arg "python_version=${python_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile . +docker build \ + --build-arg "ff_environment_base_image=${ff_environment_base_image}" \ + --build-arg "N_BUILD_CORES=${n_build_cores}" \ + --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" \ + --build-arg "cuda_version=${cuda_version}" \ + --build-arg "hip_version=${hip_version}" \ + --build-arg "python_version=${python_version}" \ + -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" \ + -f docker/flexflow-environment/Dockerfile \ + . # If the user only wants to build the environment image, we are done if [[ "$image" == "flexflow-environment" ]]; then @@ -162,4 +172,11 @@ fi # Set value of BUILD_CONFIGS get_build_configs -docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "gpu_backend_version=${gpu_backend_version}" -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow/Dockerfile . +docker build \ + --build-arg "N_BUILD_CORES=${n_build_cores}" \ + --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" \ + --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" \ + --build-arg "gpu_backend_version=${gpu_backend_version}" \ + -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" \ + -f docker/flexflow/Dockerfile \ + . diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 7028fc4b2e..373331f0e7 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -4,6 +4,8 @@ FROM ${ff_environment_base_image} LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow environment container" +SHELL ["/bin/bash", "-c"] + # Install basic dependencies RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ @@ -53,6 +55,8 @@ ENV CUDA_DIR /usr/local/cuda # GPU-specific dependencies ARG FF_GPU_BACKEND "cuda" +ARG cuda_version "" +ARG hip_version "5.6" # Update NCCL if FF_GPU_BACKEND is cuda # RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ @@ -73,7 +77,6 @@ ARG FF_GPU_BACKEND "cuda" # package attempts to re-install cuda even though cuda is already installed # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` -ARG hip_version "5.6" RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Installing HIP dependencies"; \ # Check that hip_version is one of 5.3,5.4,5.5,5.6 @@ -106,9 +109,13 @@ RUN rm -rf /var/lib/apt/lists/* # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing -# Install CPU-only Pytorch and related dependencies -RUN conda install pytorch torchvision torchaudio -c pytorch -RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops +# Install Pytorch +COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytorch.sh +RUN chmod +x /usr/local/bin/install_pytorch.sh && \ + /usr/local/bin/install_pytorch.sh ${cuda_version} && \ + rm /usr/local/bin/install_pytorch.sh +# Various dependencies +RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft diff --git a/docker/flexflow-environment/install_pytorch.sh b/docker/flexflow-environment/install_pytorch.sh new file mode 100755 index 0000000000..144b080e23 --- /dev/null +++ b/docker/flexflow-environment/install_pytorch.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Check if CUDA version is supplied +if [ -z "$1" ]; then + echo "Please provide the CUDA version as XX.Y (e.g., 11.8)" + exit 1 +fi + +# Extract major and minor version from input +CUDA_VERSION=$1 +MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d '.' -f 1) +MINOR_VERSION=$(echo "$CUDA_VERSION" | cut -d '.' -f 2) + +# Function to install PyTorch +install_pytorch() { + local major=$1 + local minor=$2 + + echo "Attempting to install PyTorch with CUDA ${major}.${minor} support..." + + # Run dry-run first + if pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} --dry-run; then + echo "Dry-run succeeded, proceeding with actual installation..." + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} + return 0 + else + echo "Dry-run failed for CUDA ${major}.${minor}." + return 1 + fi +} + +# Try to install with provided CUDA version or lower +while [ "$MINOR_VERSION" -ge 0 ]; do + if install_pytorch "$MAJOR_VERSION" "$MINOR_VERSION"; then + echo "PyTorch installation successful with CUDA ${MAJOR_VERSION}.${MINOR_VERSION}" + exit 0 + else + # Decrease the minor version + MINOR_VERSION=$((MINOR_VERSION - 1)) + + # Abort if minor version is less than 0 (all <= input failed) + if [ "$MINOR_VERSION" -lt 0 ]; then + echo "All minor versions <= input failed. Searching for the smallest minor version." + fi + fi +done + +# Now attempt to find the smallest available minor version >= 0 +MINOR_VERSION=0 +echo "Starting search for the smallest minor version..." + +while true; do + if install_pytorch "$MAJOR_VERSION" "$MINOR_VERSION"; then + echo "PyTorch installation successful with CUDA ${MAJOR_VERSION}.${MINOR_VERSION}" + exit 0 + else + # Increase minor version to search for available one + MINOR_VERSION=$((MINOR_VERSION + 1)) + + # Stop if no valid version is found after a certain number of tries + # For practical purposes, let's assume we won't go beyond minor version 10 + if [ "$MINOR_VERSION" -gt 10 ]; then + echo "No valid PyTorch installation found for CUDA ${MAJOR_VERSION}. Aborting." 
+ exit 1 + fi + fi +done From bf6be8cd72ef74acc4bf0e9be698dac5abd139fa Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:30:28 +0000 Subject: [PATCH 34/44] shellcheck --- docker/flexflow-environment/install_pytorch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/flexflow-environment/install_pytorch.sh b/docker/flexflow-environment/install_pytorch.sh index 144b080e23..9fe5151877 100755 --- a/docker/flexflow-environment/install_pytorch.sh +++ b/docker/flexflow-environment/install_pytorch.sh @@ -19,9 +19,9 @@ install_pytorch() { echo "Attempting to install PyTorch with CUDA ${major}.${minor} support..." # Run dry-run first - if pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} --dry-run; then + if pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu"${major}${minor}" --dry-run; then echo "Dry-run succeeded, proceeding with actual installation..." - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu"${major}${minor}" return 0 else echo "Dry-run failed for CUDA ${major}.${minor}." From 4f6990f4ebd3c1a2cbe4e7bd3e67daa7430c6536 Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:40:30 +0000 Subject: [PATCH 35/44] update --- docker/flexflow-environment/Dockerfile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 373331f0e7..596d099f79 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -111,9 +111,15 @@ RUN rm -rf /var/lib/apt/lists/* RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install Pytorch COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytorch.sh -RUN chmod +x /usr/local/bin/install_pytorch.sh && \ - /usr/local/bin/install_pytorch.sh ${cuda_version} && \ - rm /usr/local/bin/install_pytorch.sh +RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing PyTorch with CUDA"; \ + chmod +x /usr/local/bin/install_pytorch.sh && \ + /usr/local/bin/install_pytorch.sh ${cuda_version} && \ + rm /usr/local/bin/install_pytorch.sh; \ + else \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing CPU-only PyTorch"; \ + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu; \ + fi # Various dependencies RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook From 89f10f4257887a3288435bbd7aa4bb0e628b8a33 Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:41:32 +0000 Subject: [PATCH 36/44] update --- docker/flexflow-environment/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 596d099f79..d571befdda 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -114,12 +114,12 @@ COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytor RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Installing PyTorch with CUDA"; \ chmod +x /usr/local/bin/install_pytorch.sh && \ - /usr/local/bin/install_pytorch.sh ${cuda_version} && \ - rm /usr/local/bin/install_pytorch.sh; \ + /usr/local/bin/install_pytorch.sh ${cuda_version}; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing CPU-only PyTorch"; \ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu; \ fi +RUN rm /usr/local/bin/install_pytorch.sh # Various dependencies RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook From d09ba0c26c11c1d5cd4f7f3935cbbb585d4de18c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 4 Nov 2024 11:52:24 -0500 Subject: [PATCH 37/44] ChatCompletion + Multi-EOS support (#1535) * init * support templates * support for multiple eos token ids * fix * fix * fix conda env for ci --- conda/flexflow.yml | 6 +- include/flexflow/flexflow_c.h | 7 +- include/flexflow/request_manager.h | 6 +- inference/incr_decoding/incr_decoding.cc | 19 +++- inference/peft/peft.cc | 19 +++- inference/peft/peft_bwd_benchmark.cc | 19 +++- inference/peft/peft_fwd_benchmark.cc | 19 +++- inference/peft/req_rate_benchmark.cc | 19 +++- inference/python/chat.py | 100 ++++++++++++++++++++ inference/spec_infer/spec_infer.cc | 41 +++++--- python/flexflow/core/flexflow_cffi.py | 115 +++-------------------- python/flexflow/serve/serve.py | 54 ++++++++++- src/c/flexflow_c.cc | 28 ++++-- src/runtime/request_manager.cc | 32 +++++-- 14 files changed, 327 insertions(+), 157 deletions(-) create mode 100644 inference/python/chat.py diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 091ba929e4..771b40ecd5 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -16,9 +16,9 @@ dependencies: - qualname>=0.1.0 - keras_preprocessing>=1.1.2 - numpy>=1.16.0 - - torch>=1.13.1 --index-url https://download.pytorch.org/whl/cpu - - torchaudio>=0.13.1 --index-url https://download.pytorch.org/whl/cpu - - torchvision>=0.14.1 --index-url https://download.pytorch.org/whl/cpu + - torch>=1.13.1 + - torchaudio>=0.13.1 + - torchvision>=0.14.1 - regex - onnx - transformers>=4.31.0 diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52f67d8efb..6501b0658c 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -653,6 +653,7 @@ void flexflow_model_generate(flexflow_model_t handle_, char **output_texts, int *max_lengths, int *max_new_tokens_, + bool *add_special_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1019,6 +1020,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +int flexflow_request_manager_get_max_sequence_length( + flexflow_request_manager_t handle_); + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_); @@ -1026,7 +1030,8 @@ void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, int bos_token_id, - int eos_token_id, + int num_eos_token_ids, + int *eos_token_ids, char const *tokenizer_filepath); void flexflow_request_manager_register_output_filepath( diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 94bfc74244..d62b610f3d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -69,6 +69,7 @@ struct Request { 
PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_length = -1; int max_new_tokens = -1; + bool add_special_tokens = true; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; @@ -146,7 +147,7 @@ class RequestManager { int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path); void register_output_filepath(std::string const &); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); @@ -178,6 +179,7 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, @@ -301,7 +303,7 @@ class RequestManager { bool verbose; ModelType model_type; int bos_token_id; - int eos_token_id; + std::vector eos_token_ids; bool old_llama_tokenizer = false; std::string output_filepath; std::queue pending_infr_request_queue; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index f8e16f24fa..f148d440e2 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -199,9 +199,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -212,7 +223,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 14fc653eba..0ab0b62ee8 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -229,9 +229,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. 
Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -267,7 +278,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index df9a1e35db..85e97ec4e8 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -230,9 +230,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -251,7 +262,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 9b020f5954..87322a42dd 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -230,9 +230,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. 
Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -251,7 +262,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index cde3b1c02e..ffa77478e1 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -292,9 +292,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -313,7 +324,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/python/chat.py b/inference/python/chat.py new file mode 100644 index 0000000000..13ece116a6 --- /dev/null +++ b/inference/python/chat.py @@ -0,0 +1,100 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
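+
+# Minimal chat-completion walkthrough: build the runtime configs, initialize the
+# FlexFlow runtime, load an instruction-tuned LLM, compile it for inference,
+# start the background server, and generate a reply to a list of role/content
+# chat messages.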
+ +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 1, + "memory_per_gpu": 30000, + "zero_copy_memory_per_node": 60000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "meta-llama/Meta-Llama-3-8B-Instruct", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=2048, + max_tokens_per_batch=256, + ) + + llm.start_server() + + messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + llm.generate(messages, max_new_tokens=256) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow inference example (incremental decoding)") + main() diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 134ae70c4a..7ec3cf61f5 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -47,7 +47,8 @@ struct ModelMeta { std::string llm_weights_path; std::string llm_model_config_path; - int bos_token_id, eos_token_id; + int bos_token_id; + std::vector eos_token_ids; std::vector ssm_model_types; std::vector ssm_model_config_paths; @@ -191,10 +192,20 @@ void get_model_meta(FilePaths &file_paths, llm_model_config.find("bos_token_id") == llm_model_config.end() ? -1 : (int)llm_model_config.at("bos_token_id"); - model_metadata.eos_token_id = - llm_model_config.find("eos_token_id") == llm_model_config.end() - ? -1 - : (int)llm_model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. 
Convert to std::vector + std::vector eos_token_ids; + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { std::string ssm_config_path = join_path({file_paths.cache_folder_path, @@ -241,15 +252,15 @@ void get_model_meta(FilePaths &file_paths, ssm_model_config.find("bos_token_id") == ssm_model_config.end() ? -1 : (int)ssm_model_config.at("bos_token_id"); - int ssm_eos_id = - ssm_model_config.find("eos_token_id") == ssm_model_config.end() - ? -1 - : (int)ssm_model_config.at("eos_token_id"); - if (ssm_bos_id != model_metadata.bos_token_id || - ssm_eos_id != model_metadata.eos_token_id) { - printf("Warning: bos/eos token id mismatch between LLM and one of the " - "SSMs!\n"); - } + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? -1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } model_metadata.ssm_model_types.push_back(ssm_model_type); model_metadata.ssm_model_config_paths.push_back(ssm_config_path); model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); @@ -310,7 +321,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer(model_metadata.llm_model_type, model_metadata.bos_token_id, - model_metadata.eos_token_id, + model_metadata.eos_token_ids, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index e2240f0b4f..59e62ea023 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1588,7 +1588,12 @@ def register_tokenizer( c_model_type = enum_to_int(ModelType, model_type) c_tokenizer_filepath = get_c_name(tokenizer_filepath) return ffc().flexflow_request_manager_register_tokenizer( - self.handle, c_model_type, bos_token_id, eos_token_id, c_tokenizer_filepath + self.handle, + c_model_type, + bos_token_id, + len(eos_token_id), + eos_token_id, + c_tokenizer_filepath, ) def register_output_filepath(self, output_filepath): @@ -1622,6 +1627,9 @@ def set_max_sequence_length(self, max_length): self.handle, max_length ) + def get_max_sequence_length(self): + return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + def set_enable_peft_finetuning(self, enable_peft_finetuning): return ffc().flexflow_request_manager_set_enable_peft_finetuning( self.handle, enable_peft_finetuning @@ -2060,6 +2068,7 @@ class Request: prompt: Optional[str] = None max_length: int = -1 max_new_tokens: int = -1 + add_special_tokens: bool = True peft_model_id: Optional[PEFTModelID] = None dataset_filepath: Optional[str] = None max_training_steps: int = 1 @@ -4652,91 +4661,6 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def _estimate_max_num_tokens( - max_length: int, max_new_tokens: int, prompt: Optional[str] - ): - if prompt is None: - assert 
max_new_tokens == -1 - return ( - math.ceil(max_new_tokens + len(prompt.split()) * 1.5) - if max_new_tokens != -1 - else max_length - ) - - def _estimate_max_num_chars( - max_length: int, max_new_tokens: int, prompt: Optional[str] - ): - return ( - 5 * FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) - + 100 - ) - - # deprecated - def generate_inf_only( - self, - prompt_list: List[str], - max_length: int, - max_new_tokens: int, - ): - if max_length != -1 and max_new_tokens != -1: - raise ValueError( - f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) seem to have been set." - ) - if max_length == -1 and max_new_tokens == -1: - raise ValueError( - f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) were left unset." - ) - assert isinstance(prompt_list, list) - c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - c_output_texts = [ - ffi.new( - "char[]", - FFModel._estimate_max_num_chars(max_length, max_new_tokens, prompt), - ) - for prompt in prompt_list - ] - c_output_length_and_tokens = [ - ffi.new( - "int[]", - FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) - + 100, - ) - for prompt in prompt_list - ] - c_request_types = [ - enum_to_int(RequestType, RequestType.REQ_INFERENCE) for _ in prompt_list - ] - max_lengths = [max_length for _ in prompt_list] - max_new_tokens_ = [max_new_tokens for _ in prompt_list] - peft_model_ids = [PEFTModelID.no_id_handle() for _ in prompt_list] - dataset_filepaths = [ffi.NULL for _ in prompt_list] - training_steps = [0 for _ in prompt_list] - num_finetuning_losses = ffi.new("int *") - c_finetuning_losses = ffi.new("float[]", 0) - ffc().flexflow_model_generate( - self.handle, - len(prompt_list), - c_request_types, - c_input_texts, - c_output_texts, - max_lengths, - max_new_tokens_, - peft_model_ids, - dataset_filepaths, - training_steps, - c_output_length_and_tokens, - num_finetuning_losses, - c_finetuning_losses, - ) - from flexflow.serve import GenerationResult - - return [ - GenerationResult( - text=ffi.string(c_output_text), tokens=[], finetuning_losses=[] - ) - for c_output_text in c_output_texts - ] - def generate(self, requests_list: List[Request]): assert isinstance(requests_list, list) for request in requests_list: @@ -4756,37 +4680,27 @@ def generate(self, requests_list: List[Request]): raise ValueError( f"Finetuning requests should not have `max_new_tokens` set." 
) + max_sequence_length = RequestManager().get_max_sequence_length() c_input_texts = [ get_c_name(request.prompt) for request in requests_list ] # entry will be None for finetuning requests c_output_texts = [ ( - ffi.new( - "char[]", - FFModel._estimate_max_num_chars( - request.max_length, request.max_new_tokens, request.prompt - ), - ) + ffi.new("char[]", max_sequence_length * 5) if request.req_type == RequestType.REQ_INFERENCE else ffi.NULL ) for request in requests_list ] c_output_length_and_tokens = [ - ffi.new( - "int[]", - FFModel._estimate_max_num_tokens( - request.max_length, request.max_new_tokens, request.prompt - ) - + 100, - ) - for request in requests_list + ffi.new("int[]", max_sequence_length + 100) for request in requests_list ] c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] max_lengths = [request.max_length for request in requests_list] max_new_tokens_ = [request.max_new_tokens for request in requests_list] + add_special_tokens_ = [request.add_special_tokens for request in requests_list] peft_model_ids = [ ( @@ -4813,6 +4727,7 @@ def generate(self, requests_list: List[Request]): c_output_texts, max_lengths, max_new_tokens_, + add_special_tokens_, peft_model_ids, dataset_filepaths, training_steps, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index c8540a6ed3..e4248a2fc1 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -27,7 +27,7 @@ MPTConfig, ) from flexflow.core import * -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc @@ -104,6 +104,7 @@ def __init__( self.output_file = output_file self.rm = None self.pefts = {} + self.tokenizer=None def __del__(self): # Stop the background server before deleting the object @@ -499,6 +500,10 @@ def compile( eos_token_id = ( -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id ) + if type(eos_token_id) == int: + eos_token_id = [eos_token_id] + elif type(eos_token_id) != list: + raise ValueError("eos_token_id must be an integer or a list of integers") self.rm.register_tokenizer( self.model_type, bos_token_id, eos_token_id, self.tokenizer_path ) @@ -548,9 +553,29 @@ def _generate(self, requests: List[Request]): ) return self.model.ffmodel.generate(requests) + def __chat2prompt(self, messages: List[dict]): + """Convert a list of messages to a single prompt string + + :param messages: The list of messages to convert + :type messages: List[dict] + :return: The prompt string + :rtype: str + """ + # ensure that each element is a dictionary, containing the "role" and "content" keys + for message in messages: + if type(message) != dict or "role" not in message or "content" not in message: + raise ValueError( + "Each element in the list must be a dictionary with the keys 'role' and 'content'" + ) + if self.tokenizer is None: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.tokenizer.chat_template is None: + raise ValueError(f"Model {self.model_name} does not support chat completion") + return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + def generate( self, - requests_or_prompts: Union[str, List[str], Request, List[Request]], + requests_or_prompts: Union[str, List[str], List[dict], Request, List[Request]], max_length: int = -1, 
max_new_tokens: int = -1, ): @@ -591,7 +616,30 @@ def generate( for req in requests_or_prompts ] return self._generate(requests) - else: + elif type(requests_or_prompts[0]) == dict: + prompt = self.__chat2prompt(requests_or_prompts) + request = Request( + req_type=RequestType.REQ_INFERENCE, + prompt=prompt, + max_length=max_length, + max_new_tokens=max_new_tokens, + add_special_tokens=False, + ) + return self._generate([request]) + elif type(requests_or_prompts[0]) == list: + prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] + requests = [ + Request( + req_type=RequestType.REQ_INFERENCE, + prompt=prompt, + max_length=max_length, + max_new_tokens=max_new_tokens, + add_special_tokens=False, + ) + for prompt in prompts + ] + return self._generate(requests) + elif type(requests_or_prompts[0]) == Request: print(requests_or_prompts) return self._generate(requests_or_prompts) else: diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index bfa60a6d54..da90c586e3 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1685,6 +1685,7 @@ void flexflow_model_generate(flexflow_model_t handle_, char **output_texts, int *max_lengths, int *max_new_tokens_, + bool *add_special_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1701,22 +1702,25 @@ void flexflow_model_generate(flexflow_model_t handle_, inference_req.prompt = text_str; inference_req.max_length = max_lengths[i]; inference_req.max_new_tokens = max_new_tokens_[i]; + inference_req.add_special_tokens = add_special_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i %i", i, handle, text_str.c_str(), max_lengths[i], - max_new_tokens_[i]); + max_new_tokens_[i], + add_special_tokens_[i]); } else if (request_types[i] == RequestType::REQ_FINETUNING) { Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.max_length = max_lengths[i]; fine_tuning_req.max_new_tokens = max_new_tokens_[i]; + fine_tuning_req.add_special_tokens = add_special_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; @@ -1725,12 +1729,13 @@ void flexflow_model_generate(flexflow_model_t handle_, fine_tuning_req.dataset_filepath = dataset_fp; fine_tuning_req.max_training_steps = training_steps[i]; requests.push_back(fine_tuning_req); - DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i", + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i %i", i, handle, dataset_fp.c_str(), max_lengths[i], - max_new_tokens[i], + max_new_tokens_[i], + add_special_tokens_[i], training_steps[i]); } else { assert(false && "Unknown request type"); @@ -2754,6 +2759,12 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +int flexflow_request_manager_get_max_sequence_length( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + return handle->get_max_sequence_length(); +} + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2766,14 +2777,19 @@ void 
flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, int bos_token_id, - int eos_token_id, + int num_eos_token_ids, + int *eos_token_ids, char const *tokenizer_filepath) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); assert(tokenizer_filepath != nullptr && "Cannot convert nullptr char * to std::string"); std::string const tokenizer_filepath_str(tokenizer_filepath); + std::vector eos_token_ids_vec; + for (int i = 0; i < num_eos_token_ids; i++) { + eos_token_ids_vec.push_back(eos_token_ids[i]); + } handle->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath_str); + model_type, bos_token_id, eos_token_ids_vec, tokenizer_filepath_str); DEBUG_PRINT( "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5fbee65e6d..193abbb455 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -56,6 +56,7 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { os << " peft_model_id: " << req.peft_model_id << "\n"; os << " max_length: " << req.max_length << "\n"; os << " max_new_tokens: " << req.max_new_tokens << "\n"; + os << " add_special_tokens: " << req.add_special_tokens << "\n"; os << " initial_len: " << req.initial_len << "\n"; os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; os << " llm_cache_size: " << req.llm_cache_size << "\n"; @@ -178,11 +179,11 @@ void RequestManager::set_inference_finished(bool finished) { void RequestManager::register_tokenizer(ModelType type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path) { this->model_type = type; this->bos_token_id = bos_token_id; - this->eos_token_id = eos_token_id; + this->eos_token_ids = eos_token_ids; std::filesystem::path tokenizer_folder(path); if (model_type == ModelType::LLAMA) { @@ -271,6 +272,7 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; + request.add_special_tokens = request_.add_special_tokens; // both unset if (request.max_length == -1 && request.max_new_tokens == -1) { request.max_length = get_max_sequence_length() - 1; @@ -285,7 +287,8 @@ RequestManager::RequestGuid } request.peft_model_id = request_.peft_model_id; request.warmup = request_.warmup; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + if (bos_token_id >= 0 && model_type != ModelType::FALCON && + request.add_special_tokens) { request.tokens.push_back(bos_token_id); } if (request_.benchmarking_tokens >= 0) { @@ -378,6 +381,7 @@ RequestManager::RequestGuid request.initial_len = 0; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; + request.add_special_tokens = request_.add_special_tokens; if (request.max_new_tokens != -1) { std::cerr << "Error: max_new_tokens is not allowed for PEFT finetuning requests" @@ -402,7 +406,8 @@ RequestManager::RequestGuid request.benchmarking_tokens = request_.benchmarking_tokens; std::vector input_tokens; std::vector output_tokens; - bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + bool bos_added = (bos_token_id >= 0 && request.add_special_tokens && + model_type != ModelType::FALCON); if (bos_added) { input_tokens.push_back(bos_token_id); } @@ -424,7 +429,8 @@ RequestManager::RequestGuid std::string output_text(""); std::vector input_tokens; input_tokens = 
this->tokenizer_->Encode(text); - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + if (bos_token_id >= 0 && model_type != ModelType::FALCON && + request.add_special_tokens) { input_tokens.insert(input_tokens.begin(), bos_token_id); } std::vector output_tokens = @@ -557,6 +563,15 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::is_eos_token(int token_id) { + for (int eos_token : eos_token_ids) { + if (token_id == eos_token) { + return true; + } + } + return false; +} + bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, int i) { Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; @@ -564,7 +579,7 @@ bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, // printf("model_type = %d\n", this->model_type); if (request.tokens.size() >= old_bc.requestsInfo[i].max_length) { request_completed = true; - } else if (request.tokens.back() == eos_token_id) { + } else if (is_eos_token(request.tokens.back())) { // Encounter EOS token id request_completed = true; } @@ -673,6 +688,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1134,6 +1150,7 @@ BeamSearchBatchConfig // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1277,6 +1294,7 @@ BeamSearchBatchConfig // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1325,7 +1343,7 @@ BeamSearchBatchConfig // Unlike Huggingface, the sentencepiece C++ library automatically removes // the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && - request.tokens.at(0) == bos_token_id) { + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } log_req_mgr.print("Output: %s", output.c_str()); From fc884fec852f1d3e3cadd6da1ce1b8e1b8630252 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 4 Nov 2024 19:24:56 +0000 Subject: [PATCH 38/44] fix inference test --- tests/peft/hf_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py index 94fb96f029..3760f05055 100644 --- a/tests/peft/hf_utils.py +++ b/tests/peft/hf_utils.py @@ -223,15 +223,15 @@ def save_lora_weights(self, model, pre_finetuning=False): if not pre_finetuning: self.step_count += 1 - def on_step_end( - self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs - ): - self.save_lora_weights(model, pre_finetuning=False) - - def on_step_begin( - self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs - ): - self.save_lora_weights(model, pre_finetuning=True) + def on_step_end(self, args, state, control, **kwargs): + model_ = kwargs.get("model", None) + assert model_ is not None + self.save_lora_weights(model_, pre_finetuning=False) + + def on_step_begin(self, args, state, control, **kwargs): + model_ = 
kwargs.get("model", None) + assert model_ is not None + self.save_lora_weights(model_, pre_finetuning=True) def on_train_end(self, args, state, control, **kwargs): if verbose: From 2047bdd26c1cf89854c2666edfcbbaa8a799175a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 6 Nov 2024 01:34:09 +0000 Subject: [PATCH 39/44] fix --- tests/fine_grained_alignment_test.sh | 2 +- tests/inference/inference_alignment_test.py | 66 +++++++++++---------- tests/peft/alignment/align_test_utils.py | 4 +- 3 files changed, 39 insertions(+), 33 deletions(-) diff --git a/tests/fine_grained_alignment_test.sh b/tests/fine_grained_alignment_test.sh index 9ad26318f9..0ef1341951 100755 --- a/tests/fine_grained_alignment_test.sh +++ b/tests/fine_grained_alignment_test.sh @@ -11,7 +11,7 @@ CACHE_PATH=${FF_CACHE_PATH:-"~/.cache/flexflow"} NUM_STEPS=${NUM_STEPS:-2} cleanup() { - rm -rf "${CACHE_PATH}"/debug ./fine_grained_alignment_config.json ./inference/output/fine_grained_alignment_test_ff.txt ./inference/output/fine_grained_alignment_test_hf.txt + eval rm -rf "${CACHE_PATH}/debug" ./fine_grained_alignment_config.json ./inference/output/fine_grained_alignment_test_ff.txt ./inference/output/fine_grained_alignment_test_hf.txt } # Cd into directory holding this script diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py index 6fff4906f7..8dab7ff43b 100644 --- a/tests/inference/inference_alignment_test.py +++ b/tests/inference/inference_alignment_test.py @@ -17,7 +17,7 @@ def check_bwd_pass(self): def check_step(self, step_idx, learning_rate=0.001): raise NotImplementedError() -class LllamaAlignmentTest(AlignmentTest): +class LlamaAlignmentTest(AlignmentTest): def __init__(self, hf_config, tp_degree=1): self.hf_config = hf_config self.num_layers = self.hf_config.num_hidden_layers @@ -168,7 +168,10 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') self.ff_batch_size = ff_tensor.shape[0] - ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + if "lm_head" in ff_tensor_path: + ff_shape = replace_value(ff_shape, 1, self.ff_batch_size) + else: + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] if self.tp_degree > 1: # if replicate, check that they are identical @@ -356,11 +359,14 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE)[:,:,-1].squeeze() + hf_tensor = hf_tensor.squeeze() + print(hf_tensor.shape, ff_tensor.shape) compare(hf_tensor, ff_tensor, label="LM head input") output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, 
tp_type=TPType.PARTITION)[:,:,-1].squeeze() + hf_tensor = hf_tensor.squeeze() compare(hf_tensor, ff_tensor, label="LM head output") class OPTAlignmentTest(AlignmentTest): @@ -664,17 +670,17 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance assert torch.allclose(ff_qkv_tensor_out, ff_attn_tensor_in) # Compared scaled qproj - hf_tensor_name = f"layers.{i}.self_attn.scaled_qproj" - input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - scaled_qproj_in = get_hf_tensor(hf_tensor_name, input_c) - scaled_qproj_out = get_hf_tensor(hf_tensor_name, output_c) - assert torch.allclose(scaled_qproj_in, scaled_qproj_out) - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.scaled_qkv_proj" - scaled_qkv_proj0 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0"), [64*6,3,9]) - scaled_qkv_proj1 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0").replace("shard_0", "shard_1"), [64*6,3,9]) - ff_scaled_qkv_proj = np.concatenate([scaled_qkv_proj0, scaled_qkv_proj1], axis=0) - ff_scaled_q_proj = torch.from_numpy(ff_scaled_qkv_proj[:, :1, :]).to(scaled_qproj_out.dtype) + # hf_tensor_name = f"layers.{i}.self_attn.scaled_qproj" + # input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # scaled_qproj_in = get_hf_tensor(hf_tensor_name, input_c) + # scaled_qproj_out = get_hf_tensor(hf_tensor_name, output_c) + # assert torch.allclose(scaled_qproj_in, scaled_qproj_out) + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.scaled_qkv_proj" + # scaled_qkv_proj0 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0"), [64*6,3,9]) + # scaled_qkv_proj1 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0").replace("shard_0", "shard_1"), [64*6,3,9]) + # ff_scaled_qkv_proj = np.concatenate([scaled_qkv_proj0, scaled_qkv_proj1], axis=0) + # ff_scaled_q_proj = torch.from_numpy(ff_scaled_qkv_proj[:, :1, :]).to(scaled_qproj_out.dtype) # print("HF scaled qproj:") # print(scaled_qproj_out.squeeze().T) # print("FF scaled q proj:") @@ -688,15 +694,15 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # check that out_proj input, attn_scores out and input are identical on the hf side - hf_tensor_name = f"layers.{i}.self_attn.attn_scores" - input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - attn_scores_in = get_hf_tensor(hf_tensor_name, input_c) - attn_scores_out = get_hf_tensor(hf_tensor_name, output_c) + # hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + # input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # attn_scores_in = get_hf_tensor(hf_tensor_name, input_c) + # attn_scores_out = get_hf_tensor(hf_tensor_name, output_c) hf_tensor_name = f"layers.{i}.self_attn.out_proj" - out_proj_in = get_hf_tensor(hf_tensor_name, input_c) - assert torch.allclose(attn_scores_in, 
attn_scores_out) - assert torch.allclose(attn_scores_in, out_proj_in) + # out_proj_in = get_hf_tensor(hf_tensor_name, input_c) + # assert torch.allclose(attn_scores_in, attn_scores_out) + # assert torch.allclose(attn_scores_in, out_proj_in) # Compare out proj input. This should be the output of the attention without any bias involved hf_tensor_name = f"layers.{i}.self_attn.out_proj" @@ -707,12 +713,12 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) compare(hf_tensor, ff_tensor, label=f"Attention o-proj {i} input") - hf_tensor_name = f"layers.{i}.self_attn.attn_scores" - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" - output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) - compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + # hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + # output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + # compare(hf_tensor, ff_tensor, label=f"Attention {i} output") # hf_tensor_name = f"layers.{i}.final_layer_norm" # ff_tensor_name = f"layers.{i}.layers.{i}.add_bias_residual_layer_norm" @@ -808,7 +814,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_config = AutoConfig.from_pretrained(args.model_name) alignment_class = None if hf_config.architectures[0] == "LlamaForCausalLM": - alignment_class = LllamaAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) + alignment_class = LlamaAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) elif hf_config.architectures[0] == "OPTForCausalLM": alignment_class = OPTAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index 3085bbda56..f5ed8ae65b 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -472,9 +472,9 @@ class TensorComparisonIdxs: def replace_value(lst, old_value, new_value): occurrences = lst.count(old_value) if occurrences == 0: - raise ValueError(f"Value {old_value} not found in the list.") + raise ValueError(f"Value {old_value} not found in the list: {lst}") elif occurrences > 1: - warnings.warn(f"Multiple instances of {old_value} found in the list.") + warnings.warn(f"Multiple instances of {old_value} found in the list: {lst}") occurrence_idx=0 for i, value in enumerate(lst): if value == old_value: From 2fd529d48579d0f7165f2965cc538107bcaf951f Mon Sep 17 00:00:00 2001 From: Pinku Surana Date: Fri, 8 Nov 2024 11:50:00 -0500 Subject: [PATCH 40/44] Add support for OFI conduit in GASNet (#1538) GASNet's OFI conduit is used for the Slingshot network on Perlmutter and Frontier. It takes an additional configuration, GASNet_SYSTEM, configured for either slingshot10 or slingshot11. 
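
As a rough usage sketch (the exact entry point is an assumption; the environment
variables are the ones this patch wires through config/config.linux and
config/config.inc), a Slingshot-11 system would be configured with something like:

    FF_LEGION_NETWORKS=gasnet FF_GASNET_CONDUIT=ofi FF_GASNET_SYSTEM=slingshot11 ./config/config.linux

Older Slingshot-10 networks would set FF_GASNET_SYSTEM=slingshot10 instead.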
--- cmake/legion.cmake | 1 + config/config.inc | 7 +++++++ config/config.linux | 3 +++ 3 files changed, 11 insertions(+) diff --git a/cmake/legion.cmake b/cmake/legion.cmake index 2afb507d3b..adcf5618f8 100644 --- a/cmake/legion.cmake +++ b/cmake/legion.cmake @@ -132,6 +132,7 @@ else() set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version") set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit") set(GASNet_CONDUIT ${FF_GASNET_CONDUIT}) + set(GASNet_SYSTEM ${FF_GASNET_SYSTEM}) elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx") set(ucx_ROOT ${UCX_PATH}/lib/cmake) message(STATUS "Find ucx: ${UCX_PATH}") diff --git a/config/config.inc b/config/config.inc index 011fe890fb..b4356bf078 100644 --- a/config/config.inc +++ b/config/config.inc @@ -118,6 +118,13 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi" elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp" + elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi" + if [ "$FF_GASNET_SYSTEM" = "slingshot11" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_SYSTEM=slingshot11" + elif [ "$FF_GASNET_SYSTEM" = "slingshot10" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_SYSTEM=slingshot10" + fi fi elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx" diff --git a/config/config.linux b/config/config.linux index 09976cfa03..aae7901494 100755 --- a/config/config.linux +++ b/config/config.linux @@ -61,6 +61,9 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} +# select GASNET system (usually with OFI conduit) +FF_GASNET_SYSTEM=${FF_GASNET_SYSTEM:-slingshot11} + # set UCX dir if Legion networks is set to ucx UCX_DIR=${UCX_DIR:-""} From 1bef1a311d9d056acd207b26e0541a91d26125f7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Nov 2024 20:00:41 +0000 Subject: [PATCH 41/44] update --- docker/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/run.sh b/docker/run.sh index cdf9383052..46c63bab6f 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -120,4 +120,4 @@ if [ -f "$hf_token_path" ]; then hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" From 7dcbd62f98061ea25938ecd8b4d13fbd3b8e638c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Nov 2024 15:12:37 -0500 Subject: [PATCH 42/44] FlexLLM server demo (#1510) * init * update * update * update * update * add max new tokens parameter * backup * update * backup * lora configs serialize / deserialize into single file * backup * . * . * . * . 
* frontend * bug fix * fixes * fix * updates * fix * fix * fix * small fix * fix * fix reset input grad for non-activated loras * fix * update * demo fixes & readme * load weights in parallel * cleanup * cleanup * load weights faster in inference test * fix * cleanup and fixes * linting * fix * cleanup * docker run update --- docker/flexflow-environment/Dockerfile | 3 +- docker/run.sh | 13 +- include/flexflow/batch_config.h | 6 +- include/flexflow/config.h | 4 - include/flexflow/fftype.h | 1 + include/flexflow/flexflow_c.h | 11 +- include/flexflow/model.h | 11 +- include/flexflow/operator.h | 2 +- include/flexflow/ops/kernels/linear_kernels.h | 2 + .../ops/kernels/lora_linear_kernels.h | 38 +- include/flexflow/ops/lora_linear.h | 19 +- include/flexflow/ops/lora_linear_params.h | 51 +- include/flexflow/request_manager.h | 14 + include/flexflow/utils/file_loader.h | 23 +- .../flexflow/utils/peft_weight_allocator.h | 163 ++-- inference/models/falcon.cc | 8 + inference/models/llama.cc | 11 +- inference/models/mpt.cc | 8 + inference/models/opt.cc | 10 +- inference/models/starcoder.cc | 7 + inference/peft/peft.cc | 17 +- inference/peft/peft_bwd_benchmark.cc | 8 +- inference/peft/peft_fwd_benchmark.cc | 8 +- inference/peft/req_rate_benchmark.cc | 6 +- inference/python/chat.py | 23 +- inference/python/ff_peft.py | 51 +- inference/python/incr_decoding.py | 1 - inference/python/peft_demo/INSTRUCTIONS.md | 2 +- inference/python/peft_demo/demo.ipynb | 6 +- inference/python/peft_demo/demo.py | 5 +- inference/python/spec_infer.py | 1 - inference/python/streamlit/README.md | 18 + inference/python/streamlit/app.py | 188 +++++ inference/python/streamlit/fastapi_incr.py | 207 +++++ inference/utils/download_peft_model.py | 32 +- python/flexflow/core/__init__.py | 1 - python/flexflow/core/flexflow_cffi.py | 18 +- python/flexflow/serve/__init__.py | 9 - python/flexflow/serve/models/falcon.py | 4 + python/flexflow/serve/models/llama.py | 4 + python/flexflow/serve/models/mpt.py | 4 + python/flexflow/serve/models/opt.py | 4 + python/flexflow/serve/models/starcoder.py | 4 + python/flexflow/serve/serve.py | 424 ++++++----- src/c/flexflow_c.cc | 46 +- src/mapper/mapper.cc | 4 + src/ops/fused.cu | 3 + src/ops/kernels/linear_kernels.cu | 45 ++ src/ops/kernels/lora_linear_kernels.cu | 291 +++---- src/ops/linear.cc | 1 + src/ops/lora_linear.cc | 719 +++++------------- src/ops/lora_linear_params.cc | 147 +++- src/runtime/fftype.cc | 4 + src/runtime/file_loader.cc | 68 +- src/runtime/inference_manager.cc | 4 +- src/runtime/model.cc | 90 +-- src/runtime/model.cu | 31 +- src/runtime/peft_weight_allocator.cc | 319 ++++++++ src/runtime/peft_weight_allocator.cu | 80 ++ src/runtime/request_manager.cc | 121 ++- .../inference/huggingface_inference_simple.py | 51 ++ tests/inference/huggingface_pipeline.py | 33 + tests/inference/inference_alignment_test.py | 2 +- .../python_test_configs/generate_configs.py | 12 +- tests/peft/alignment/align_test_utils.py | 2 +- tests/peft/hf_finetune.py | 2 +- tests/peft/peft_alignment_test.py | 73 +- tests/peft_test.sh | 12 +- 68 files changed, 2326 insertions(+), 1284 deletions(-) create mode 100644 inference/python/streamlit/README.md create mode 100644 inference/python/streamlit/app.py create mode 100644 inference/python/streamlit/fastapi_incr.py create mode 100644 src/runtime/peft_weight_allocator.cc create mode 100644 src/runtime/peft_weight_allocator.cu create mode 100644 tests/inference/huggingface_inference_simple.py create mode 100644 tests/inference/huggingface_pipeline.py diff --git 
a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index d571befdda..2af81de11f 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -7,7 +7,7 @@ LABEL org.opencontainers.image.description="FlexFlow environment container" SHELL ["/bin/bash", "-c"] # Install basic dependencies -RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \ +RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq openssh-client && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \ @@ -125,6 +125,7 @@ RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft +RUN pip3 install streamlit # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index 46c63bab6f..759da521aa 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -17,6 +17,11 @@ hip_version=${hip_version:-"empty"} ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi +FORWARD_STREAMLIT_PORT=${FORWARD_STREAMLIT_PORT:-true} +port_forward_arg="" +if $FORWARD_STREAMLIT_PORT ; then + port_forward_arg+="-p 8501:8501" +fi # Amount of shared memory to give the Docker container access to @@ -120,4 +125,10 @@ if [ -f "$hf_token_path" ]; then hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +ssh_key_volume="" +ssh_key_path="$HOME/.ssh/id_rsa" +if [ -f "$ssh_key_path" ]; then + # If the token exists, add the volume mount to the Docker command + ssh_key_volume+="-v $ssh_key_path:/root/.ssh/id_rsa" +fi +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${ssh_key_volume}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a509af765c..bb8b4c67f6 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,6 +20,7 @@ #include "legion.h" #include #include +#include // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 @@ -74,6 +75,7 @@ class BatchConfig { static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; + static int const MAX_PEFT_CONFIG_SIZE = 1024; // Set by update @@ -89,11 +91,12 @@ class BatchConfig { num_tokens_in_batch = 0; max_length = 0; request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; prompt_phase = false; batch_config_request_id = -1; - peft_model_id = PEFTModelID::NO_ID; peft_bwd = false; optimizer_tasks = {true, false, false, false}; + std::memset(peft_model_config_str, 0, MAX_PEFT_CONFIG_SIZE); } int first_token_depth_in_request; int first_token_offset_in_batch; @@ -106,6 +109,7 @@ class BatchConfig { RequestGuid request_guid; // PEFT fields PEFTModelID peft_model_id; + 
char peft_model_config_str[MAX_PEFT_CONFIG_SIZE]; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index dd9d657117..37afa0df27 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -104,8 +104,6 @@ struct FFHandler { // PEFT related fields MemoryAllocator *peft_activation_allocator; size_t peft_activation_reserve_space_size; - PEFTWeightAllocator *peft_weight_allocator; - size_t peft_weight_reserve_space_size; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; @@ -118,7 +116,6 @@ struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -179,7 +176,6 @@ class FFConfig { // PEFT related fields bool enable_peft; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 3e482b8d67..ebc811c262 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -27,6 +27,7 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); friend std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id); diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 6501b0658c..677f9915cd 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -91,6 +91,8 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_); int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_); +bool flexflow_config_get_enable_peft(flexflow_config_t handle_); + void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, int value); @@ -622,7 +624,11 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); -flexflow_peft_model_id_t flexflow_model_add_lora_layer( +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_); + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, @@ -1023,6 +1029,9 @@ void flexflow_request_manager_set_max_sequence_length( int flexflow_request_manager_get_max_sequence_length( flexflow_request_manager_t handle_); +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters); + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 51b7950db8..e352159af0 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,6 +278,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, + LOAD_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -835,7 +836,9 @@ class FFModel { // ======================================== // PEFT Layers // 
======================================== - PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== @@ -1170,9 +1173,9 @@ class FFModel { std::vector parameters; // PEFT related std::unordered_map base_layer_to_peft_layer; - std::unordered_map> peft_layer_to_peft_id; - std::unordered_map peft_configs; - // std::vector peft_operators; + // std::unordered_map> + // peft_layer_to_peft_id; std::unordered_map + // peft_configs; std::vector peft_operators; FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 007314797a..c108740ef3 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -280,7 +280,7 @@ class Op { // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid - << std::endl; + << (before_kernel ? " (before kernel)" : "") << std::endl; // build the path to save the tensor fs::path dst_filepath; if (fwd_pass) { diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 90e50a0c9a..aaa845db23 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -61,6 +61,7 @@ void inference_kernel_wrapper(LinearMeta *m, int out_dim, int batch_size); void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -94,6 +95,7 @@ void forward_kernel(LinearMeta const *m, ffStream_t stream); template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index eee9875d30..fd86dc68c0 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -6,43 +6,27 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/lora_linear.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { + using Legion::Context; using Legion::Runtime; -struct LoraLinearWeight { - // weights - void *w0_ptr, *w1_ptr; - // gradients - void *w0_grad_ptr, *w1_grad_ptr; - // v values for SGD optimizer (when using momentum) - void *w0_v_values_ptr, *w1_v_values_ptr; - int in_dim, out_dim, rank, num_shards; -}; - -struct LoraLinearModelState { - LoraLinearWeight weights; - LoraOptimizerConfig const *optimizer_config; - float lora_alpha; - std::string cache_folder; - // Huggingface model ID (for download and/or upload) - std::string peft_model_id; -}; class LoraLinearMeta : public OpMeta { public: LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); - // PEFT related fields - void *low_rank_activation; - void *input_activation; - std::unordered_map model_state; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + PEFTMemoryManager *peft_memory_manager; }; namespace Kernels { namespace LoraLinear { -void init_kernel_wrapper(LoraLinearMeta *m, int seed); + +bool 
lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config); + +// void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -51,12 +35,13 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); namespace Internal { -template -void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +// template +// void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -70,6 +55,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 9e83c3f90e..cc625cafc2 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -17,14 +17,13 @@ class LoraLinear : public Op { using Params = LoraLinearParams; using Input = std::pair; - LoraLinear( - FFModel &model, - LayerID const &layer_guid, - OperatorType type, - ParallelTensor const input, - ParallelTensor const output, - std::unordered_map const &_peft_configs, - char const *name = nullptr); + LoraLinear(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + ParallelTensor const output, + int max_rank, + int max_concurrent_adapters, + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, @@ -91,7 +90,9 @@ class LoraLinear : public Op { // size_t get_params_hash() const override; LoraLinearParams get_params() const; - std::unordered_map peft_configs; + // std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 70539271f2..46b88c9690 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -17,6 +17,9 @@ namespace FlexFlow { class LoraOptimizerConfig { public: LoraOptimizerConfig(); + virtual std::string getType() const = 0; + virtual nlohmann::json toJson() const = 0; + static LoraOptimizerConfig *fromJson(nlohmann::json const &j); virtual ~LoraOptimizerConfig() {} }; @@ -29,9 +32,11 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + std::string getType() const override { + return "SGD"; + } + nlohmann::json toJson() const override; + static LoraSGDOptimizerConfig *fromJson(nlohmann::json const &j); public: double lr = 0.001f; @@ -51,8 +56,11 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { friend std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc); - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + std::string getType() const override { + return "Adam"; + } + nlohmann::json toJson() const override; + static LoraAdamOptimizerConfig *fromJson(nlohmann::json const &j); public: // Adam @@ -63,14 +71,6 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { 
double epsilon = 1e-8; }; -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath); - -// Function to deserialize JSON from file and create object -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath); - class LoraLinearConfig { public: static const LoraLinearConfig EmptyConfig; @@ -92,17 +92,14 @@ class LoraLinearConfig { friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, - cache_folder, - peft_model_id, - rank, - lora_alpha, - lora_dropout, - target_modules, - trainable, - init_lora_weights, - base_model_name_or_path, - precision) + std::string serialize_to_json_string(int indent = -1) const; + void serialize_to_json_file(std::string const &filename) const; + // Deserialization method + static LoraLinearConfig + deserialize_from_json_string(std::string const &json_string); + // Deserialization method + static LoraLinearConfig + deserialize_from_json_file(std::string const &filename); std::string cache_folder; // Huggingface model ID (for download and/or upload) @@ -128,8 +125,8 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - OperatorType type; - std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; char name[MAX_OPNAME]; bool is_valid(std::pair const @@ -147,4 +144,4 @@ struct hash { }; } // namespace std -#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H \ No newline at end of file diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d62b610f3d..c15c0ff8b4 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -150,6 +150,13 @@ class RequestManager { std::vector eos_token_ids, std::string const &path); void register_output_filepath(std::string const &); + void set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); + LoraLinearConfig const &get_peft_config(PEFTModelID const &peft_model_id); + void set_max_lora_rank(int max_lora_rank); + void set_max_concurrent_adapters(int max_concurrent_adapters); + int get_max_lora_rank(); + int get_max_concurrent_adapters(); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, @@ -182,6 +189,9 @@ class RequestManager { bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); + void add_peft_config_to_request_info(BatchConfig &bc, + int req_idx, + LoraLinearConfig const &peft_config); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -291,6 +301,10 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + // peft + std::unordered_map peft_configs; + int max_lora_rank = 32; + int max_concurrent_adapters = 0; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 646eb18da2..8735f23571 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -39,7 +39,13 @@ class FileDataLoader { void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void 
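Editor's note: `LoraLinearConfig` now carries its own JSON round-trip (`serialize_to_json_string` / `deserialize_from_json_string`), which is what lets a per-request adapter config travel inside `BatchConfig::peft_model_config_str`. A hedged sketch of what such a payload could look like, using only the member names visible in the class above; the exact layout emitted by `serialize_to_json_string` is an assumption, and the sample values are the ones used elsewhere in this patch.

```
import json

# Illustrative only: keys mirror the LoraLinearConfig members listed above
# (cache_folder, peft_model_id, rank, lora_alpha, ...). The exact JSON layout
# produced by LoraLinearConfig::serialize_to_json_string is an assumption.
lora_config_payload = {
    "cache_folder": "~/.cache/flexflow",
    "peft_model_id": "goliaro/llama-2-7b-lora-full",
    "rank": 16,
    "lora_alpha": 16.0,
    "lora_dropout": 0.0,
    "target_modules": ["down_proj"],
    "trainable": False,
    "init_lora_weights": False,
    "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
    "precision": "fp16",
}

serialized = json.dumps(lora_config_payload)
# BatchConfig::MAX_PEFT_CONFIG_SIZE is 1024 bytes, so the string must fit.
assert len(serialized) < 1024
print(serialized)
```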
load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); - void load_weights(FFModel *ff); + + static void + load_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -54,3 +60,18 @@ class FileDataLoader { std::string weights_folder; bool use_full_precision; }; + +struct WeightLoadTaskArgs { + FFModel *ff; + FileDataLoader *loader; + Layer *layer; + int weight_idx; + DataType data_type; + WeightLoadTaskArgs(FFModel *_ff, + FileDataLoader *_loader, + Layer *_l, + int _idx, + DataType _data_type) + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), + data_type(_data_type) {} +}; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index dae46a8af1..21ac9bf426 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,76 +17,121 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/lora_linear_params.h" +// #include namespace FlexFlow { -class PEFTWeightAllocator { -public: - PEFTWeightAllocator(void *_base_ptr, size_t _total_size) - : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), - local_offset(_total_size) {} +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // activations + void *input_activation; + void *low_rank_activation; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + LoraLinearWeight(void *w0 = nullptr, + void *w1 = nullptr, + void *w0_grad = nullptr, + void *w1_grad = nullptr, + void *w0_v_values = nullptr, + void *w1_v_values = nullptr, + void *low_rank_activation_ = nullptr, + void *input_activation_ = nullptr) + : w0_ptr(w0), w1_ptr(w1), w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), + w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), + low_rank_activation(low_rank_activation_), + input_activation(input_activation_) {} +}; - inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - void *ptr = static_cast(base_ptr) + sync_offset; - off_t model_sync_weights_offset = sync_offset; - size_t model_sync_weights_size = datalen; - if (sync_weights.find(peft_model_id) != sync_weights.end()) { - // Assert that sync weights for each PEFT model is consecutive - std::pair offset_and_size = sync_weights[peft_model_id]; - assert(sync_offset == offset_and_size.first + offset_and_size.second); - model_sync_weights_offset = offset_and_size.first; - model_sync_weights_size = offset_and_size.second + datalen; - } - sync_offset += datalen; - assert(sync_offset < local_offset); - sync_weights[peft_model_id] = - std::make_pair(model_sync_weights_offset, model_sync_weights_size); - return ptr; - } +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed); - std::pair - get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { - const std::lock_guard lock(peft_weight_allocator_mutex); - assert(sync_weights.find(peft_model_id) != sync_weights.end()); - std::pair offset_and_size = sync_weights[peft_model_id]; - return std::make_pair(static_cast(base_ptr) + offset_and_size.first, - 
offset_and_size.second); +class PEFTMemoryManager { +public: + PEFTMemoryManager(Legion::Memory gpu_mem_, + int max_rank_, + int max_concurrent_adapters_, + int max_peft_tokens_, + int in_dim_, + int out_dim_, + int num_shards_, + int shard_id_, + std::string const &lora_layername_substr_, + DataType dt_) + : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), + num_shards(num_shards_), shard_id(shard_id_), + max_peft_tokens(max_peft_tokens_), + lora_layername_substr(lora_layername_substr_), dt(dt_), + base_ptr(nullptr), finetuning_ptr(nullptr), + finetuning_model_id(PEFTModelID::NO_ID) { + max_lora_size = + data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); + assert(max_concurrent_adapters > 0 && + "PEFT Memory Manager max_concurrent_adapters must be > 0"); + assert(max_lora_size > 0 && + "PEFT Memory Manager max_lora_size must be > 0"); + allocate_inference_memory(); + // finetuning memory is allocated upon the first finetuning request, so we + // can skip for inference-only workloads } - inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - local_offset -= datalen; - assert(sync_offset < local_offset); - void *ptr = static_cast(base_ptr) + local_offset; - return ptr; - } + // allocate memory for all the PEFT adapters for a given layer on a given + // shard + void allocate_inference_memory(); + // allocate memory for the PEFT adapter for a finetuning request for a given + // layer and shard + void allocate_finetuning_memory(); - template - inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); - } + LoraLinearWeight get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); + void check_ft_model_id(PEFTModelID const &model_id); - template - inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); - } +private: + // Check if the PEFT adapter for the given model is in memory. If not, sets + // the cache_miss flag to true. If this is the first finetuning request, + // allocate memory for the finetuning adapter. + void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); + // Returns the slot in memory where the peft model weights are/will be stored. + // If the model is not in memory (cache miss), set the cache_miss flag to + // true. + int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); + void load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); -public: - void *base_ptr; - size_t total_size; - off_t sync_offset, local_offset; - std::unordered_map> sync_weights; - std::mutex peft_weight_allocator_mutex; + // Legion memory management apparatus + Legion::Memory gpu_mem; + Realm::RegionInstance peftLegionInst; + void *base_ptr, *finetuning_ptr; + // Size and shapes + int max_concurrent_adapters; + int max_rank; + int max_lora_size; + int in_dim, out_dim, num_shards, shard_id; + int max_peft_tokens; + // LRU cache apparatus + std::unordered_map lru_hashtable; + std::vector + lru_list; // head = least recently used, tail=most recently used + std::unordered_map peft2mem_slot; + // Miscellanea + std::string lora_layername_substr; + DataType dt; + PEFTModelID finetuning_model_id; }; -}; // namespace FlexFlow +} // namespace FlexFlow #endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index fd4da87b99..b4f961b006 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -269,6 +269,14 @@ void FALCON::create_falcon_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"dense_h_to_4h", + "dense_4h_to_h"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index bd5243bd4b..7b4a14b472 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -250,9 +250,6 @@ void LLAMA::create_llama_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("down_proj"), std::string("layers." 
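Editor's note: the `PEFTMemoryManager` introduced in `peft_weight_allocator.h` above replaces the old offset-based `PEFTWeightAllocator`: it pre-allocates `max_concurrent_adapters` fixed-size slots per LoRA layer and shard, and on a cache miss it reuses a free slot or evicts the least recently used adapter. A minimal Python sketch of that slot/LRU bookkeeping follows; it is illustrative only, the real logic lives in the C++ class above and the names here are hypothetical.

```
from collections import OrderedDict

class SlotLRU:
    """Illustrative sketch of the bookkeeping described by PEFTMemoryManager:
    a fixed pool of adapter slots, reused in least-recently-used order.
    Not the actual C++ implementation."""

    def __init__(self, max_concurrent_adapters: int):
        self.free_slots = list(range(max_concurrent_adapters))
        self.model_to_slot = OrderedDict()  # model_id -> slot, in LRU order

    def get_slot(self, model_id):
        """Return (slot, cache_miss). On a miss, take a free slot or evict the
        least recently used adapter; the caller then reloads the LoRA weights."""
        if model_id in self.model_to_slot:
            self.model_to_slot.move_to_end(model_id)  # mark most recently used
            return self.model_to_slot[model_id], False
        if self.free_slots:
            slot = self.free_slots.pop()
        else:
            _, slot = self.model_to_slot.popitem(last=False)  # evict LRU entry
        self.model_to_slot[model_id] = slot
        return slot, True

pool = SlotLRU(max_concurrent_adapters=2)
print(pool.get_slot("adapter-a"))  # cache miss: weights must be loaded
print(pool.get_slot("adapter-a"))  # hit: slot is reused as-is
```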
+ - // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -297,6 +294,14 @@ void LLAMA::create_llama_model(FFModel &ff, } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = { + "gate_proj", "up_proj", "down_proj"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index d02c0f3b82..6807266ef4 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -272,6 +272,14 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } + + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"up_proj", "down_proj"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 34a6bb0f02..cb3d5290cf 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -243,9 +243,6 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("fc2"), std::string("layers." + - // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -286,6 +283,13 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"fc1", "fc2"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2429b1ec1b..3dd61be983 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -253,6 +253,13 @@ void STARCODER::create_starcoder_model( } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"c_fc", "c_proj"}; + ff.add_lora_layers(target_modules); + } + InferenceManager *im = InferenceManager::get_inference_manager(); FileDataLoader *fileloader = new FileDataLoader( "", diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 0ab0b62ee8..4f2d47055a 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -256,7 +256,7 @@ void FlexFlow::top_level_task(Task const *task, LoraOptimizerConfig *optim_config = nullptr; if (enable_peft_finetuning) { // float sgd_learning_rate = 2e-1; - float sgd_learning_rate = 1.0f; + float sgd_learning_rate = 0.001f; optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); } LoraLinearConfig peft_config_finetuning = @@ -275,6 +275,8 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_requests_per_batch( max_requests_per_batch + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_concurrent_adapters(max_requests_per_batch + + (int)enable_peft_finetuning); rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( @@ -320,18 +322,19 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - 
// Add PEFT layer + // Start background server + rm->start_background_server(&model); + + // Add PEFT adapter(s) PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); if (enable_peft_finetuning) { - peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + peft_model_id_finetuning = + model.register_peft_adapter(peft_config_finetuning); } } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 85e97ec4e8..9da4fa1994 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 87322a42dd..3274f2e535 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index ffa77478e1..8a94f6e68b 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -366,14 +366,14 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/inference/python/chat.py b/inference/python/chat.py index 13ece116a6..95132443a2 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py @@ -21,14 +21,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 1, - "memory_per_gpu": 30000, - "zero_copy_memory_per_node": 60000, + "num_gpus": 8, + "memory_per_gpu": 34000, + "zero_copy_memory_per_node": 200000, # optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 16, + "legion_utility_processors": 16, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 1, + "tensor_parallelism_degree": 8, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -36,7 +36,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, 
"peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -44,7 +43,7 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3-8B-Instruct", + "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -86,11 +85,15 @@ def main(): llm.start_server() + nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + llama_generic_system = "You are a helpful an honest programming assistant." + + messages=[ - {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "system", "content": nemotron_system}, {"role": "user", "content": "Is Rust better than Python?"}, ] - llm.generate(messages, max_new_tokens=256) + llm.generate(messages, max_new_tokens=1024) llm.stop_server() diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 13da7aee20..0167cecebc 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -41,14 +41,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, + "num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 10000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, + "tensor_parallelism_degree": 4, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": True, "fusion": False, @@ -103,6 +102,23 @@ def main(): refresh_cache=configs.refresh_cache, output_file=configs.output_file, ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + max_concurrent_adapters=1 if not enable_peft_finetuning else 2, + enable_peft_finetuning=enable_peft_finetuning, + ) + + llm.start_server() + # Add inference and/or finetuning lora lora_inference_config = None lora_finetuning_config = None @@ -112,18 +128,8 @@ def main(): configs.inference_peft_model_id, base_model_name_or_path=configs.base_model, ) - llm.add_peft(lora_inference_config) + llm.register_peft_adapter(lora_inference_config) if len(configs.finetuning_dataset) > 0: - # lora_finetuning_config = ff.LoraLinearConfig( - # llm.cache_path, - # configs.finetuning_peft_model_id, - # target_modules=["down_proj"], - # rank=16, - # lora_alpha=16, - # trainable=True, - # init_lora_weights=True, - # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, - # ) lora_finetuning_config = ff.LoraLinearConfig( llm.cache_path, configs.inference_peft_model_id, @@ -137,22 +143,7 @@ def main(): "nesterov": False, }, ) - llm.add_peft(lora_finetuning_config) - - # Compile the LLM for inference and load the weights into memory - generation_config = ff.GenerationConfig( - 
do_sample=False, temperature=0.9, topp=0.8, topk=1 - ) - enable_peft_finetuning = len(configs.finetuning_dataset) > 0 - llm.compile( - generation_config, - enable_peft_finetuning=enable_peft_finetuning, - max_requests_per_batch=1 if not enable_peft_finetuning else 2, - max_seq_length=256, - max_tokens_per_batch=128, - ) - - llm.start_server() + llm.register_peft_adapter(lora_finetuning_config) requests = [] # Serving diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 232ef1699c..4bb6892a6b 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md index 9b2a7a53b2..0f78efdea9 100644 --- a/inference/python/peft_demo/INSTRUCTIONS.md +++ b/inference/python/peft_demo/INSTRUCTIONS.md @@ -13,7 +13,7 @@ * `export HUGGINGFACE_TOKEN="[Your token]"` * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` - * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full"` * Run the demo ``` diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb index dfb5193a1d..ea2b8417b6 100644 --- a/inference/python/peft_demo/demo.ipynb +++ b/inference/python/peft_demo/demo.ipynb @@ -91,7 +91,6 @@ " \"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", @@ -195,7 +194,7 @@ } ], "source": [ - "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "args = [configs.inference_peft_model_id]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" ] }, @@ -1773,7 +1772,6 @@ " \"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", @@ -1815,7 +1813,7 @@ "configs = SimpleNamespace(**configs_dict)\n", "\n", "\n", - "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\"]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", "\n", "# Initialize the FlexFlow runtime. 
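Editor's note: as the `ff_peft.py` diff above shows, adapters are now registered after `llm.compile()` and `llm.start_server()`, and the adapter pool is sized at compile time via `max_concurrent_adapters`. A condensed sketch of that ordering; the model and adapter IDs are simply the ones used elsewhere in this patch, and the resource numbers are placeholders.

```
import flexflow.serve as ff

# Placeholder resource configs; enable_peft must be set for LoRA layers to be added.
ff.init({
    "num_gpus": 4,
    "memory_per_gpu": 14000,
    "zero_copy_memory_per_node": 10000,
    "tensor_parallelism_degree": 4,
    "enable_peft": True,
})

llm = ff.LLM("meta-llama/Llama-2-7b-hf")
llm.compile(
    ff.GenerationConfig(do_sample=False),
    max_requests_per_batch=1,
    max_seq_length=256,
    max_tokens_per_batch=128,
    max_concurrent_adapters=1,   # sizes the per-layer adapter pool
)
llm.start_server()

# Adapters are registered with the running model, no longer before compile()
lora_config = ff.LoraLinearConfig(llm.cache_path, "goliaro/llama-2-7b-lora-full")
llm.register_peft_adapter(lora_config)
```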
ff.init() takes a dictionary or the path to a JSON file with the configs\n", diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py index 9e01b4645b..b70f3c8966 100644 --- a/inference/python/peft_demo/demo.py +++ b/inference/python/peft_demo/demo.py @@ -47,7 +47,6 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": False, @@ -99,7 +98,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data file.write('') # Download base and peft inference models -args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.inference_peft_model_id] # hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) subprocess.run(['python', '../../utils/download_peft_model.py'] + args) @@ -207,7 +206,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data ) llm.add_peft(lora_inference_config) -args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.finetuning_peft_model_id] #hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) # subprocess.run(['python', '../../utils/download_peft_model.py'] + args) diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 7ae752cffc..8cf96c1eba 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/streamlit/README.md b/inference/python/streamlit/README.md new file mode 100644 index 0000000000..86a15e2d6d --- /dev/null +++ b/inference/python/streamlit/README.md @@ -0,0 +1,18 @@ +# Streamlit demo + +## Instructions + +1. Build and install FlexFlow, or build and run `source ./set_python_envs.sh` from the build folder +2. Edit the FlexFlow/inference/python/streamlit/fastapi_incr.py to configure the model to run and the system configs (num gpus, amount of memory, etc) +3. In one terminal, launch the LLM engine with the commands below, and wait until the model's weights loading completes +``` +cd FlexFlow/inference/python/streamlit +python fastapi_incr.py +``` +4. In another terminal, launch the streamlit app: +``` +cd FlexFlow/inference/python/streamlit +streamlit run app.py +``` +5. Open the URL printed to the terminal, e.g. 
`http://localhost:8501` and interact with the app via browser + diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py new file mode 100644 index 0000000000..9788765a3a --- /dev/null +++ b/inference/python/streamlit/app.py @@ -0,0 +1,188 @@ +import streamlit as st +import requests +import os, json +from huggingface_hub import model_info + + +# App title +st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide") + +# FastAPI server URL +FASTAPI_URL = "http://localhost:8000/chat/completions" # Adjust the port if necessary +FINETUNE_URL = "http://localhost:8000/finetuning" + +# Initialize session state variables +if 'added_adapters' not in st.session_state: + st.session_state.added_adapters = [] + +# Store LLM generated responses +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + +def check_model_availability(model_name): + try: + info = model_info(model_name) + return True + except Exception: + return False + +def clear_chat_history(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + +# Function for generating LLaMA2 response +def generate_llama3_response(prompt_input): + system_prompt="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + + # Send request to FastAPI server + response = requests.post(FASTAPI_URL, json={"max_new_tokens": 1024, "messages": [{"role": "system", "content": system_prompt}] + st.session_state.messages + [{"role": "user", "content": prompt_input}]}) + + if response.status_code == 200: + return response.json()["response"] + else: + return f"Error: {response.status_code} - {response.text}" + +# Sidebar +with st.sidebar: + st.title('🚀 FlexLLM Server') + page = st.radio("Choose a page", ["Chat", "Finetune"]) + if page == "Chat": + st.header('🦙 Llama Chatbot') + # st.success('Using local FastAPI server', icon='✅') + st.sidebar.button('Clear Chat History', on_click=clear_chat_history) + + st.subheader('Generation parameters') + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=2048, value=1024, step=8) + # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') + decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') + temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + + # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') + st.subheader("LoRA Adapters (optional)") + # Text input for PEFT model ID + peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") + # Button to load the adapter + if st.button("Load Adapter"): + if peft_id: + with st.spinner("Checking PEFT availability..."): + is_available = check_model_availability(peft_id) + if is_available: + if peft_id not in st.session_state.added_adapters: + st.session_state.added_adapters.append(peft_id) + st.success(f"Successfully added PEFT: {peft_id}") + else: + st.warning(f"PEFT {peft_id} is already in the list.") + else: + st.error(f"PEFT {peft_id} is 
not available on Hugging Face. Please check the ID and try again.") + else: + st.warning("Please enter a PEFT Model ID.") + # Button to remove all adapters + if st.button("Remove All Adapters"): + st.session_state.added_adapters = [] + st.success("All adapters have been removed.") + # Display the list of added adapters + st.markdown("**Added Adapters:**") + if st.session_state.added_adapters: + for adapter in st.session_state.added_adapters: + st.write(f"- {adapter}") + else: + st.write("No adapters added yet.") + # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') + elif page == "Finetune": + st.header("🏋️‍♂️ LoRA Finetuning") + + # Hugging Face token input + # hf_token = st.text_input("Enter your Hugging Face token:", type="password") + if 'hf_token' in st.session_state.keys(): + st.success('HF token already provided!', icon='✅') + hf_token = st.session_state.hf_token + else: + hf_token = st.text_input('Enter your Hugging Face token:', type='password') + if not (hf_token.startswith('hf_') and len(hf_token)==37): + st.warning('please enter a valid token', icon='⚠️') + else: + st.success('Proceed to finetuning your model!', icon='👉') + st.session_state.hf_token = hf_token + + # PEFT model name + peft_model_name = st.text_input("Enter the PEFT model name:", help="The name of the PEFT model should start with the username associated with the provided HF token, followed by '/'ß. E.g. 'username/peft-base-uncased'") + + # Dataset selection + dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) + + if dataset_option == "Upload JSON": + uploaded_file = st.file_uploader("Upload JSON dataset", type="json") + if uploaded_file is not None: + dataset = json.load(uploaded_file) + st.success("Dataset uploaded successfully!") + else: + dataset_name = st.text_input("Enter Hugging Face dataset name:") + + # Finetuning parameters + st.subheader("Finetuning parameters") + lora_rank = st.number_input("LoRA rank", min_value=2, max_value=64, value=16, step=2) + lora_alpha = st.number_input("LoRA alpha", min_value=2, max_value=64, value=16, step=2) + target_modules = st.multiselect("Target modules", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"], default=["down_proj"]) + learning_rate = st.number_input("Learning rate", min_value=1e-6, max_value=1e-3, value=1e-5, step=1e-6) + optimizer_type = st.selectbox("Optimizer type", ["SGD", "Adam", "AdamW", "Adagrad", "Adadelta", "Adamax", "RMSprop"]) + momentum = st.number_input("Momentum", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + weight_decay = st.number_input("Weight decay", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + nesterov = st.checkbox("Nesterov") + max_steps = st.number_input("Max steps", min_value=1000, max_value=100000, value=10000, step=1000) + + # Start finetuning button + if st.button("Start Finetuning"): + if not hf_token: + st.error("Please enter your Hugging Face token.") + elif dataset_option == "Upload JSON" and uploaded_file is None: + st.error("Please upload a JSON dataset.") + elif dataset_option == "Hugging Face Dataset" and not dataset_name: + st.error("Please enter a Hugging Face dataset name.") + else: + # Prepare the request data + request_data = { + "token": hf_token, + "dataset_source": dataset_option, + } + + if dataset_option == "Upload JSON": + request_data["dataset"] = dataset + else: + request_data["dataset_name"] = dataset_name + + # Send finetuning request to FastAPI 
server + with st.spinner("Finetuning in progress..."): + response = requests.post(FINETUNE_URL, json=request_data) + + if response.status_code == 200: + st.success("Finetuning completed successfully!") + else: + st.error(f"Finetuning failed. Error: {response.status_code} - {response.text}") + +if page == "Chat": + # Display or clear chat messages + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + + # User-provided prompt + if prompt := st.chat_input(): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + + # Generate a new response if last message is not from assistant + if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Running..."): + response = generate_llama3_response(prompt) + placeholder = st.empty() + full_response = '' + for item in response: + full_response += item + placeholder.markdown(full_response) + placeholder.markdown(full_response) + message = {"role": "assistant", "content": full_response} + st.session_state.messages.append(message) +elif page == "Finetune": + st.write("Use the sidebar to configure and start finetuning.") \ No newline at end of file diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py new file mode 100644 index 0000000000..6ac7f4149a --- /dev/null +++ b/inference/python/streamlit/fastapi_incr.py @@ -0,0 +1,207 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_incr.py'. +- Run the application using the command: `uvicorn fastapi_incr:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. 
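Editor's note: once `fastapi_incr.py` is running (see the instructions above), the Streamlit app talks to it via `POST /chat/completions`; the same endpoint can be exercised directly, mirroring the request `app.py` sends. A small example, assuming the default port 8000 used in this patch.

```
import requests

# Mirrors the payload app.py posts to the /chat/completions endpoint
# defined in fastapi_incr.py; the response JSON carries a "response" field.
resp = requests.post(
    "http://localhost:8000/chat/completions",
    json={
        "max_new_tokens": 256,
        "messages": [
            {"role": "system", "content": "You are a helpful and honest assistant."},
            {"role": "user", "content": "Is Rust better than Python?"},
        ],
    },
)
resp.raise_for_status()
print(resp.json()["response"])
```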
+""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace +from typing import Optional, List +import time + + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# data models +class Message(BaseModel): + role: str + content: str + + +# class ChatCompletionRequest(BaseModel): +# model: Optional[str] = "mock-gpt-model" +# messages: List[Message] +# max_tokens: Optional[int] = 512 +# temperature: Optional[float] = 0.1 +# stream: Optional[bool] = False + +class ChatCompletionRequest(BaseModel): + max_new_tokens: Optional[int] = 1024 + messages: List[Message] + +# Global variable to store the LLM model +llm = None + + +def get_configs(): + + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 8, + "memory_per_gpu": 20000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 8, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "meta-llama/Llama-3.1-8B-Instruct", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=16, + max_seq_length=2048, + max_tokens_per_batch=1024, + ) + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + 
response_text = "" + + # Return the prompt and the response in JSON format + return { + "prompt": prompt_request.prompt, + "response": response_text + } + +@app.post("/chat/completions") +async def chat_completions(request: ChatCompletionRequest): + + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + print("received request:", request) + result = llm.generate([message.dict() for message in request.messages], max_new_tokens=request.max_new_tokens)[0].output_text.decode('utf-8') + print("returning response:", result) + return { + "response": result + } + return { + "id": "1337", + "object": "chat.completion", + "created": time.time(), + "model": request.model, + "choices": [{"message": Message(role="assistant", content=resp_content)}], + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_incr:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_incr:app --reload --port 3000 diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 38dd577574..2ee63b10bc 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -1,13 +1,11 @@ #!/usr/bin/env python import flexflow.serve as ff import argparse, os +from peft import PeftConfig def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--base_model_name", type=str, help="Name of the model to download" - ) parser.add_argument( "peft_model_ids", type=str, @@ -48,19 +46,21 @@ def main(args): else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for data_type in data_types: - llm = ff.LLM( - args.base_model_name, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - for peft_model_id in args.peft_model_ids: - lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) - llm.add_peft(lora_config) - llm.download_hf_weights_if_needed() - llm.download_hf_config() - llm.download_hf_tokenizer_if_needed() + for peft_model_id in args.peft_model_ids: + hf_config = PeftConfig.from_pretrained(peft_model_id) + for data_type in data_types: + llm = ff.LLM( + hf_config.base_model_name_or_path, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + # Download base model config, weights and tokenizer + llm.download_hf_config() + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + # Download PEFT adapter + llm.download_peft_adapter_if_needed(peft_model_id) if __name__ == "__main__": diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index b8ed15eaea..52fe331bf3 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -91,7 +91,6 @@ "use_8bit_quantization": "--8bit-quantization", "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", - "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 59e62ea023..02eff0ca76 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -811,6 +811,10 @@ def 
pipeline_parallelism_degree(self, value): @property def python_data_loader_type(self): return ffc().flexflow_config_get_python_data_loader_type(self.handle) + + @property + def enable_peft(self): + return ffc().flexflow_config_get_enable_peft(self.handle) @property def cpu_offload(self): @@ -1629,6 +1633,11 @@ def set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + + def set_max_concurrent_adapters(self, max_adapters): + return ffc().flexflow_request_manager_set_max_concurrent_adapters( + self.handle, max_adapters + ) def set_enable_peft_finetuning(self, enable_peft_finetuning): return ffc().flexflow_request_manager_set_enable_peft_finetuning( @@ -4288,8 +4297,12 @@ def argmax(self, input, beam_search, name=None): self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) - def add_lora_layer(self, peft_config): - return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) + def add_lora_layers(self, target_modules: List[str]): + c_target_modules = [get_c_name(module) for module in target_modules] + return ffc().flexflow_model_add_lora_layers(self.handle, len(target_modules), c_target_modules) + + def register_peft_adapter(self, peft_config): + return ffc().flexflow_model_register_peft_adapter(self.handle, peft_config.handle) def reset_metrics(self): """Reset performance metrics. @@ -4751,6 +4764,7 @@ def generate(self, requests_list: List[Request]): finetuning_losses=finetuning_losses, ) ) + return results def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index fd29080a6a..55044d1838 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -55,7 +55,6 @@ def init( use_8bit_quantization: Optional[bool] = None, enable_peft: Optional[bool] = None, peft_activation_reserve_space_size: Optional[int] = None, - peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -86,7 +85,6 @@ def init( - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - enable_peft: whether to enable the use of PEFT, defaults to False - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB - - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -125,8 +123,6 @@ def init( :type enable_peft: Optional[bool], optional :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB :type peft_activation_reserve_space_size: Optional[int], optional - :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, 
defaults to False @@ -158,7 +154,6 @@ def init( use_8bit_quantization is not None, enable_peft is not None, peft_activation_reserve_space_size is not None, - peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -187,7 +182,6 @@ def init( "use_8bit_quantization": use_8bit_quantization, "enable_peft": enable_peft, "peft_activation_reserve_space_size": peft_activation_reserve_space_size, - "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -210,7 +204,6 @@ def init( "pipeline_parallelism_degree", "offload_reserve_space_size", "peft_activation_reserve_space_size", - "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -238,8 +231,6 @@ def init( configs_dict["enable_peft"] = False if configs_dict.get("peft_activation_reserve_space_size", None) is None: configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 - if configs_dict.get("peft_weight_reserve_space_size", None) is None: - configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0c6102406f..60aa3c27e9 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -257,6 +257,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(lm_head, 1, False) softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["dense_h_to_4h", "dense_4h_to_h"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index e149834603..ceea9e96b0 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -264,6 +264,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(dense, 1, False) softmax = ffmodel.softmax(dense, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["gate_proj", "up_proj", "down_proj"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index a0e70b381a..d927a1fbb3 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -258,6 +258,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["up_proj", "down_proj"]) + self.ffmodel = ffmodel # TODO: finish this diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index ba2e21b690..e8d6fec9af 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -287,6 +287,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["fc1", "fc2"]) + self.ffmodel = ffmodel def convert_hf_weight_name(name): diff --git 
a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index dc5faf175f..107614e9dd 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -228,6 +228,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["c_fc", "c_proj"]) + self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e4248a2fc1..c2804b6966 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -31,9 +31,17 @@ from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc -from typing import Union, List +from typing import Union, List, Tuple +from safetensors import safe_open from huggingface_hub import snapshot_download +from enum import Enum + + +class CachedResourceType(Enum): + TOKENIZER = "tokenizer" + WEIGHTS = "weights" + class _SupportedModels: def __init__( @@ -104,14 +112,14 @@ def __init__( self.output_file = output_file self.rm = None self.pefts = {} - self.tokenizer=None + self.tokenizer = None def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def add_peft(self, lora_config: LoraLinearConfig): + def register_peft_adapter(self, lora_config: LoraLinearConfig): """Add a PEFT adapter to the LLM""" if lora_config is None: raise ValueError("lora_config cannot be None") @@ -145,9 +153,12 @@ def add_peft(self, lora_config: LoraLinearConfig): f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" ) + lora_config.ff_compile() + self.pefts[lora_config] = { "peft_config": peft_config, "peft_type": peft_config.peft_type, + "ff_peft_model_id": self.model.ffmodel.register_peft_adapter(lora_config), } def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: @@ -175,34 +186,33 @@ def download_hf_config(self): os.makedirs(config_dir, exist_ok=True) print(f"Creating directory {config_dir} (if it doesn't exist)...") print(f"Saving {self.model_name} configs to file {config_path}...") - self.hf_config.to_json_file(config_path) - - # Save PEFT configs if the LLM has any registered PEFTs - for ff_peft_config, peft_dict in self.pefts.items(): - peft_config = peft_dict["peft_config"] - peft_model_id = ff_peft_config.peft_model_id - peft_config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() - ) - os.makedirs(peft_config_dir, exist_ok=True) - peft_config_path = os.path.join(peft_config_dir, "config.json") - print(f"Saving {peft_model_id} configs to file {peft_config_path}...") - with open(peft_config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - - json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, model_name: str, folder: str): + # self.hf_config.to_json_file(config_path) + src_folder = snapshot_download( + repo_id=self.model_name, allow_patterns="config.json" + ) + src_path = os.path.join(src_folder, "config.json") + if os.path.exists(src_path): + shutil.copy(src_path, config_path) + + def __get_revision_hashes( + self, model_name: 
str, folder: str + ) -> Tuple[Union[str, None], str, str]: + """Return the commit hash of the object (weight, tokenizer, etc) cached by FlexFlow and the latest commit hash of the object from HuggingFace (or other source) + + Args: + model_name (str): Name of the model cached by FlexFlow + folder (str): Folder where the cached object is stored + + Returns: + ff_revision: Commit hash of the object cached by FlexFlow + ff_revision_filepath: Path to the file containing the commit hash of the object cached by FlexFlow + latest_revision: Latest commit hash of the object from HuggingFace (or other source) + """ ff_revision = None - ff_revision_file = os.path.join(folder, "rev_sha.txt") + ff_revision_filepath = os.path.join(folder, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) + if os.path.exists(ff_revision_filepath): + ff_revision = "".join(open(ff_revision_filepath).read().split()) if os.path.exists(model_name) and os.path.isdir(model_name): # Local model @@ -215,16 +225,21 @@ def __get_revision_hashes(self, model_name: str, folder: str): # Remote HuggingFace model hf_api = HfApi() latest_revision = hf_api.model_info(self.model_name).sha - return ff_revision, ff_revision_file, latest_revision + return ff_revision, latest_revision - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. + def __get_resource_path( + self, model_name: str, resource_type: CachedResourceType + ) -> str: + """Returns the path to the folder where the model weights or tokenizer files are stored - If any PEFT adapter is registered, perform the same operation for PEFT. - """ + Args: + model_name (str): Name of the model + resource_type (CachedResourceType): Whether to get the path to the weights or the tokenizer - def get_weights_path(model_name): + Returns: + str: Path to the folder where the model weights or tokenizer files are stored + """ + if resource_type == CachedResourceType.WEIGHTS: return os.path.join( os.path.expanduser(self.cache_path), "weights", @@ -235,19 +250,49 @@ def get_weights_path(model_name): else "half-precision" ), ) + elif resource_type == CachedResourceType.TOKENIZER: + return os.path.join( + os.path.expanduser(self.cache_path), "tokenizers", model_name.lower() + ) + else: + raise ValueError(f"Invalid resource type {resource_type}") - def refresh_cache_if_needed(model_name): - weights_path = get_weights_path(model_name) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." - ) - if os.path.exists(weights_path): - shutil.rmtree(weights_path) - os.makedirs(weights_path, exist_ok=True) + def __need_cache_refresh( + self, model_name: str, resource_type: CachedResourceType + ) -> bool: + """Check whether the model weights or tokenizer files are available and up to date. + If they need a refresh, create the folder for the resource, save the new commit hash to the rev_sha.txt file, delete any existing files, and return true. 
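The helpers above implement a simple SHA-based cache invalidation: a rev_sha.txt file stored alongside the cached weights or tokenizer records the commit that was last downloaded, and it is compared against the latest commit reported by the Hugging Face Hub. A minimal standalone sketch of the same idea (illustrative only, with hypothetical helper names; not the code added by this patch):

    import os
    from huggingface_hub import HfApi

    def needs_refresh(model_name: str, resource_folder: str) -> bool:
        """Compare the cached commit SHA against the latest SHA on the Hub."""
        sha_file = os.path.join(resource_folder, "rev_sha.txt")
        cached_sha = None
        if os.path.exists(sha_file):
            cached_sha = open(sha_file).read().strip()
        latest_sha = HfApi().model_info(model_name).sha
        return cached_sha != latest_sha

    def record_revision(resource_folder: str, latest_sha: str) -> None:
        """Store the commit SHA next to the cached files as rev_sha.txt."""
        os.makedirs(resource_folder, exist_ok=True)
        with open(os.path.join(resource_folder, "rev_sha.txt"), "w") as f:
            f.write(latest_sha)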
- def get_hf_llm(model_name): - return AutoModelForCausalLM.from_pretrained( + Args: + model_name (str): Name of the model to check + resource_type (CachedResourceType): Whether to check the weights or the tokenizer + + Returns: + bool: True if the weights or tokenizer need a refresh, False otherwise + """ + resource_path = self.__get_resource_path(model_name, resource_type) + ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) + if self.refresh_cache or not os.path.exists(resource_path) or ff_revision != latest_revision: + print( + f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." + ) + if os.path.exists(resource_path): + shutil.rmtree(resource_path) + os.makedirs(resource_path, exist_ok=True) + ff_revision_file = os.path.join(resource_path, "rev_sha.txt") + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + return True + return False + + def download_hf_weights_if_needed(self) -> None: + """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. + """ + + # TODO: edit this to download the weights using snapshot_download and convert them to FlexFlow format without loading them to GPU + def download_and_convert_llm_weights(model_name): + hf_model = AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True, torch_dtype=( @@ -256,73 +301,26 @@ def get_hf_llm(model_name): else torch.float16 ), ) - - def download_llm_weights(): - refresh_cache_if_needed(self.model_name) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.weights_path + # Convert the model to FlexFlow format + weights_path = self.__get_resource_path( + model_name, CachedResourceType.WEIGHTS ) - if ff_revision != latest_revision: - print( - f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." - ) - hf_model = get_hf_llm(self.model_name) - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {self.model_name}") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - def convert_peft_model(hf_peft_model, peft_type, weights_path): - for name, params in hf_peft_model.named_parameters(): - if peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_peft_weights(): - for ff_peft_config, peft_dict in self.pefts.items(): - if not ff_peft_config.init_lora_weights: - peft_config = peft_dict["peft_config"] - peft_type = peft_dict["peft_type"] - peft_model_id = ff_peft_config.peft_model_id - - weights_path = get_weights_path(peft_model_id) - refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = ( - self.__get_revision_hashes(peft_model_id, weights_path) - ) - - if ff_revision != latest_revision: - print( - f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." 
- ) - hf_model = get_hf_llm(peft_model_id) - hf_peft_model = PeftModel.from_pretrained( - hf_model, peft_model_id, config=peft_config - ) - # Convert the model to FlexFlow format - convert_peft_model(hf_peft_model, peft_type, weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {peft_model_id}") - # Deallocate hf model - del hf_peft_model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - self.weights_path = get_weights_path(self.model_name) - download_llm_weights() - download_peft_weights() + self.model_class.convert_hf_model(hf_model, weights_path) + # Save new revision hash to file + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." + ) + download_and_convert_llm_weights(self.model_name) def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -331,25 +329,10 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.TOKENIZER ) - if self.refresh_cache: - print( - f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." - ) - if os.path.exists(self.tokenizer_path): - shutil.rmtree(self.tokenizer_path) - if not os.path.exists(self.tokenizer_path): - print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) - - # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.tokenizer_path - ) - - if ff_revision != latest_revision: + if need_refresh: print( f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." ) @@ -367,15 +350,76 @@ def download_hf_tokenizer_if_needed(self): hf_tokenizer_path = snapshot_download( repo_id=self.model_name, allow_patterns=target_tokenizer_files ) + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) for file in target_tokenizer_files: src_path = os.path.join(hf_tokenizer_path, file) - dst_path = os.path.join(self.tokenizer_path, file) + dst_path = os.path.join(tokenizer_path, file) if os.path.exists(src_path): shutil.copy(src_path, dst_path) print("Done updating HF tokenizer.") - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) + + def download_peft_adapter_if_needed(self, hf_peft_model_id: str): + """Check in the folder specified by the cache_path whether the PEFT model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. 
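Together with register_peft_adapter() and the max_concurrent_adapters setting introduced below, this gives roughly the following user-facing flow. This is a sketch only: the model and adapter names, the ff.init() resource arguments, and the exact call ordering are assumptions for illustration, not part of this patch.

    import flexflow.serve as ff

    # Initialize the FlexFlow serving runtime with PEFT support enabled.
    # (GPU/memory arguments are placeholders; adjust for your machine.)
    ff.init(num_gpus=1, memory_per_gpu=14000, zero_copy_memory_per_node=10000,
            tensor_parallelism_degree=1, pipeline_parallelism_degree=1,
            enable_peft=True)

    llm = ff.LLM("meta-llama/Llama-2-7b-hf", data_type=ff.DataType.DT_HALF)

    # Download and convert the LoRA adapter into the local weight cache.
    peft_model_id = "some-user/llama-2-7b-lora"  # hypothetical adapter repo
    llm.download_peft_adapter_if_needed(peft_model_id)

    # Compile with room for one concurrent adapter; base weights and tokenizer
    # are fetched automatically during compile if the cache is stale.
    llm.compile(ff.GenerationConfig(), max_requests_per_batch=1,
                max_seq_length=256, max_tokens_per_batch=64,
                max_concurrent_adapters=1)

    # Register the cached adapter with the compiled model.
    llm.register_peft_adapter(ff.LoraLinearConfig(llm.cache_path, peft_model_id))

    llm.start_server()
    print(llm.generate("Tell me something about PEFT.", max_new_tokens=64))
    llm.stop_server()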
+ """ + + def download_and_convert_peft_model(hf_peft_model_id: str): + if ( + self.data_type != DataType.DT_FLOAT + and self.data_type != DataType.DT_HALF + ): + raise ValueError( + "data_type must be either DataType.DT_FLOAT or DataType.DT_HALF" + ) + + # Save peft config to file + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", hf_peft_model_id.lower() + ) + dst_path = os.path.join(peft_config_dir, "config.json") + os.makedirs(peft_config_dir, exist_ok=True) + print(f"Saving {hf_peft_model_id} configs to file {dst_path}...") + config_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_config.json" + ) + src_path = os.path.join(config_path, "adapter_config.json") + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) + + # Save peft weights to file + adapter_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_model.safetensors" + ) + weights_path = self.__get_resource_path( + hf_peft_model_id.lower(), CachedResourceType.WEIGHTS + ) + with safe_open(adapter_path, framework="pt", device="cpu") as f: + for tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + if self.data_type == DataType.DT_HALF: + tensor = tensor.half() + else: + tensor = tensor.float() + tensor_name = tensor_name.replace( + "base_model.model.model.", "" + ).replace(".default", "") + print(tensor_name) + + tensor_name = self.model_class.convert_hf_weight_name(tensor_name) + tensor.detach().cpu().numpy().tofile( + f"{weights_path}/{tensor_name}" + ) + + need_refresh = self.__need_cache_refresh( + hf_peft_model_id, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{hf_peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + download_and_convert_peft_model(hf_peft_model_id) def compile( self, @@ -383,10 +427,8 @@ def compile( max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = None, - model_specific_tensor_parallelism_degree: int = None, - model_specific_pipeline_parallelism_degree: int = None, ssms: list = [], ): """Compile the LLM for inference and load the weights into memory @@ -399,14 +441,10 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the LLM a different pipeline parallelism degree than the one used to initialize the runtime, defaults to None - :type 
model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -418,24 +456,13 @@ def compile( mode = InferenceMode.TREE_VERIFY_MODE elif type(self) == SSM: mode = InferenceMode.BEAM_SEARCH_MODE + self.ffconfig.data_parallelism_degree = 1 + self.ffconfig.tensor_parallelism_degree = 1 + self.ffconfig.pipeline_parallelism_degree = 1 else: assert type(self) == LLM mode = InferenceMode.INC_DECODING_MODE - # Apply model-specific parallelism degrees, if needed - if model_specific_data_parallelism_degree: - self.ffconfig.data_parallelism_degree = ( - model_specific_data_parallelism_degree - ) - if model_specific_tensor_parallelism_degree: - self.ffconfig.tensor_parallelism_degree = ( - model_specific_tensor_parallelism_degree - ) - if model_specific_pipeline_parallelism_degree: - self.ffconfig.pipeline_parallelism_degree = ( - model_specific_pipeline_parallelism_degree - ) - self.max_seq_length = max_seq_length # Create request manager and set serving configuration @@ -443,6 +470,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate the relevant model @@ -464,12 +492,6 @@ def compile( # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() - # Add PEFT layer if registered - for ff_peft_config, peft_dict in self.pefts.items(): - ff_peft_config.ff_compile() - ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) - peft_dict["ff_peft_model_id"] = ff_peft_model_id - # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) @@ -479,8 +501,11 @@ def compile( else 20 ) + weights_path = self.__get_resource_path( + self.model_name, CachedResourceType.WEIGHTS + ) self.fileloader = FileDataLoader( - self.weights_path, + weights_path, model_configs.num_attention_heads, model_configs.num_key_value_heads, model_configs.hidden_size, @@ -504,8 +529,11 @@ def compile( eos_token_id = [eos_token_id] elif type(eos_token_id) != list: raise ValueError("eos_token_id must be an integer or a list of integers") + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) self.rm.register_tokenizer( - self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + self.model_type, bos_token_id, eos_token_id, tokenizer_path ) self.rm.register_output_filepath(self.output_file) @@ -520,14 +548,14 @@ def compile( atexit.register(self.rm.stop_server) - def _generate(self, requests: List[Request]): + def _generate(self, requests: List[Request]) -> List[GenerationResult]: if len(requests) == 0: return [] for req in requests: if req.req_type == RequestType.REQ_INFERENCE: # check max_length and max_new_tokens parameters if req.max_length == -1 and req.max_new_tokens == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 elif req.max_length != -1 and req.max_new_tokens != -1: warnings.warn( f"Both `max_new_tokens` (={req.max_new_tokens}) and `max_length`(={req.max_length}) seem to have been set. `max_new_tokens` will take precedence." @@ -546,14 +574,14 @@ def _generate(self, requests: List[Request]): f"max_new_tokens ({req.max_new_tokens}) is not allowed for finetuning requests." 
) if req.max_length == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 if req.max_length >= self.max_seq_length: raise ValueError( f"max_length ({req.max_length}) exceeds the maximum sequence length ({self.max_seq_length})" ) return self.model.ffmodel.generate(requests) - def __chat2prompt(self, messages: List[dict]): + def __chat2prompt(self, messages: List[dict]) -> str: """Convert a list of messages to a single prompt string :param messages: The list of messages to convert @@ -563,15 +591,31 @@ def __chat2prompt(self, messages: List[dict]): """ # ensure that each element is a dictionary, containing the "role" and "content" keys for message in messages: - if type(message) != dict or "role" not in message or "content" not in message: + if ( + type(message) != dict + or "role" not in message + or "content" not in message + ): raise ValueError( "Each element in the list must be a dictionary with the keys 'role' and 'content'" ) if self.tokenizer is None: self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) if self.tokenizer.chat_template is None: - raise ValueError(f"Model {self.model_name} does not support chat completion") - return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + raise ValueError( + f"Model {self.model_name} does not support chat completion" + ) + return self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + def __output2chat_response( + self, requests: List[Request], outputs: List[GenerationResult] + ) -> List[GenerationResult]: + assert len(requests) == len(outputs) + for i in range(len(outputs)): + outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt) :] + return outputs def generate( self, @@ -625,9 +669,12 @@ def generate( max_new_tokens=max_new_tokens, add_special_tokens=False, ) - return self._generate([request]) + outputs = self._generate([request]) + return self.__output2chat_response([request], outputs) elif type(requests_or_prompts[0]) == list: - prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] + prompts = [ + self.__chat2prompt(messages) for messages in requests_or_prompts + ] requests = [ Request( req_type=RequestType.REQ_INFERENCE, @@ -638,12 +685,15 @@ def generate( ) for prompt in prompts ] - return self._generate(requests) + outputs = self._generate(requests) + return self.__output2chat_response(requests, outputs) elif type(requests_or_prompts[0]) == Request: print(requests_or_prompts) return self._generate(requests_or_prompts) else: - assert False, "Please pass a string, list of strings, Request, or list of Requests" + assert ( + False + ), "Please pass a string, list of strings, Request, or list of Requests" def start_server(self): self.rm.start_server(self.model.ffmodel) @@ -685,11 +735,9 @@ def compile( generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 16, max_seq_length: int = 256, - max_tokens_per_batch: int = 128, + max_tokens_per_batch: int = 2048, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = 1, - model_specific_tensor_parallelism_degree: int = 1, - model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory @@ -699,16 +747,12 @@ def compile( :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type 
max_seq_length: int, optional - :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 2048 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the SSM a different pipeline parallelism degree than the default one, defaults to 1 - :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -717,9 +761,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + max_concurrent_adapters, enable_peft_finetuning, - model_specific_data_parallelism_degree, - model_specific_tensor_parallelism_degree, - model_specific_pipeline_parallelism_degree, ssms, ) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index da90c586e3..e16b0e87bd 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -177,6 +177,11 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, handle->pipeline_parallelism_degree = value; } +bool flexflow_config_get_enable_peft(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->enable_peft; +} + int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->python_data_loader_type; @@ -1608,18 +1613,33 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, } #ifdef FF_BUILD_INFERENCE -flexflow_peft_model_id_t flexflow_model_add_lora_layer( +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + target_modules.push_back(target_modules_[i]); + } + DEBUG_PRINT("[Add Lora Layers] model handle: %p, num_target_modules %d", + handle, + num_target_modules); + handle->add_lora_layers(target_modules); +} + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); - PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + PEFTModelID *peft_model_id = handle->register_peft_adapter(*peft_config); - DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " - "peft_model_id: %p", - handle, - peft_config, - peft_model_id); + DEBUG_PRINT( + "[Register 
PEFT Adapter] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } #endif @@ -2765,6 +2785,14 @@ int flexflow_request_manager_get_max_sequence_length( return handle->get_max_sequence_length(); } +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_concurrent_adapters(max_concurrent_adapters); + DEBUG_PRINT("[RequestManager] set max_concurrent_adapters %d", + max_concurrent_adapters); +} + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2909,7 +2937,9 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + handle->load_weights_parallel(model, ctx, runtime); } // // ----------------------------------------------------------------------- diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7b9a5e99d..c02f70f752 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -288,6 +288,10 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } + if (task.task_id == LOAD_WEIGHT_TASK_ID) { + output.initial_proc = all_cpus[0]; + return; + } if (task.task_id == TOP_LEVEL_TASK_ID) { output.initial_proc = all_cpus[0]; // control replicate top level task diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 62845c0f8e..8635fd6a87 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -862,6 +862,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); Kernels::Linear::peft_bwd_kernel_wrapper(m, + bc, my_input_grad_accessor[0].ptr, my_output_grad_accessor[0].ptr, my_weight_accessor[0].ptr, @@ -889,11 +890,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + int shard_id = task->index_point.point_data[0]; Kernels::LoraLinear::peft_bwd_kernel_wrapper( ctx, runtime, m, bc, + shard_id, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 3832428c64..51954597d7 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -16,6 +16,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -73,6 +74,17 @@ LinearMeta::~LinearMeta(void) { } } +bool lora_applies_to_this_layer(LinearMeta const *m, + LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Kernels { namespace Linear { @@ -285,6 +297,7 @@ 
void inference_kernel_wrapper(LinearMeta *m, } void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *weight_ptr, @@ -302,6 +315,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } if (m->input_type[0] == DT_FLOAT) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -312,6 +326,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -568,6 +583,7 @@ void forward_kernel(LinearMeta const *m, template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -611,6 +627,35 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use beta=1 for input_grad to accumulate gradients when needed DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + + // ensure that we only have one finetuning request, with a single lora + int num_peft_requests = 0; + bool lora_applies = false; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + continue; + } + num_peft_requests++; + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + lora_applies = true; + } + assert(num_peft_requests == 1 && + "Exactly one PEFT finetuning request is required"); + // if the request does not have any active lora in the current layer, reset + // beta to 0 std::cout << m->op_name << " original beta: " << (float)beta << " + // lora_applies: " << lora_applies << std::endl; + if (lora_applies) { + beta = 1.0f; + } + if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 638cee8cae..40095484b5 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -23,29 +23,32 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) - : OpMeta(handler, li) { - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; -} + : OpMeta(handler, li) {} LoraLinearMeta::~LoraLinearMeta(void) {} -namespace Kernels { -namespace LoraLinear { - -void init_kernel_wrapper(LoraLinearMeta *m, int seed) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - if (m->input_type[0] == DT_FLOAT) { - Internal::init_kernel(m, seed, stream); - } else if (m->input_type[0] == DT_HALF) { - Internal::init_kernel(m, seed, stream); +std::string + get_peft_dbg_folder(LoraLinearMeta const *m, int shard_id, bool is_fwd) { + std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); + fs::path dst_filepath; + if (is_fwd) { + dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); } else { - assert(false && "Unsupported data type"); + dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); } +namespace Kernels { +namespace LoraLinear { + void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -100,6 +103,7 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { cudaStream_t stream; @@ -117,6 +121,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_float_ptr(), output_grad.get_float_ptr(), in_dim, @@ -127,6 +132,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_half_ptr(), output_grad.get_half_ptr(), in_dim, @@ -151,58 +157,19 @@ void peft_bwd_kernel_wrapper(Context ctx, } } -namespace Internal { - -template -void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { - // Initialize generator - std::mt19937 gen(seed); - - // Get handle to weights by iterating over m->model_state to get each - // LoraLinearWeight object - for (auto &model_state : m->model_state) { - LoraLinearWeight weight = model_state.second.weights; - int w0_num_elements = weight.rank * weight.in_dim; - int w1_num_elements = weight.rank * weight.out_dim; - - // LoRA_A weight: [in_dim, rank] - float stdv_lora_a = 1.0f / sqrt(weight.in_dim); - std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); - std::vector
lora_a_random_init(w0_num_elements); - for (auto &num : lora_a_random_init) { - float num_float = dis_lora_a(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } - } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w0_ptr), - lora_a_random_init.data(), - w0_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); - - // LoRA_B weight: [rank, out_dim] - float stdv_lora_b = 1.0f / sqrt(weight.rank); - std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); - std::vector lora_b_random_init(w1_num_elements); - for (auto &num : lora_b_random_init) { - float num_float = dis_lora_b(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } +bool lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w1_ptr), - lora_b_random_init.data(), - w1_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); } + return false; } +namespace Internal { + template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -213,91 +180,60 @@ void inference_kernel(LoraLinearMeta *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->input_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif + int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - } - // Assert that we have at most one request that requires peft_bwd - assert(num_peft_requests <= 1); - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_length; + // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - void *intermediate_result_ptr = nullptr; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) + ? 
weight.low_rank_activation + : m->handle.workSpace; if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed1 = - data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; - size_t activation_size_needed2 = - data_type_size(m->input_type[1]) * max_peft_tokens * rank; - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - if (activation_size_needed1 > m->allocated_peft_buffer_size1) { - m->input_activation = - allocator->allocate_instance_untyped(activation_size_needed1); - m->allocated_peft_buffer_size1 = activation_size_needed1; - } - if (activation_size_needed2 > m->allocated_peft_buffer_size2) { - m->low_rank_activation = - allocator->allocate_instance_untyped(activation_size_needed2); - m->allocated_peft_buffer_size2 = activation_size_needed2; - } - // copy input activation - checkCUDA(cudaMemcpyAsync(m->input_activation, + checkCUDA(cudaMemcpyAsync(weight.input_activation, input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); - intermediate_result_ptr = m->low_rank_activation; } else { // use workspace to save intermediate result - assert(m->handle.workSpaceSize >= - data_type_size(m->input_type[1]) * num_peft_tokens * rank); - intermediate_result_ptr = m->handle.workSpace; + assert(m->handle.workSpaceSize >= data_type_size(m->input_type[1]) * + num_peft_tokens * lora_config.rank); } + DT alpha = 1.0f, beta = 0.0f; // buffer = weight_first * input // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, - rank, + lora_config.rank, num_peft_tokens, in_dim, &alpha, @@ -310,29 +246,27 @@ void inference_kernel(LoraLinearMeta *m, &beta, intermediate_result_ptr, lr_actv_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // output = weight_second * buffer // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] // Note that we use alpha in both places since we do // an in-place update for LoraLinear - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, out_dim, num_peft_tokens, - rank, + lora_config.rank, &scaling_constant, weight.w1_ptr, weight_type, - rank, + lora_config.rank, intermediate_result_ptr, lr_actv_type, - rank, + lora_config.rank, &alpha, output_ptr + first_token_offset * out_dim, output_type, @@ -340,6 +274,7 @@ void inference_kernel(LoraLinearMeta *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } + assert(num_peft_requests <= 1); } template @@ -371,6 +306,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, @@ -384,39 +320,33 @@ void peft_bwd_kernel(Context ctx, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // Skip completed, non-PEFT and PEFT forward-only requests + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id( + bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int max_peft_tokens = bc->requestsInfo[i].max_length; // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); // Compute LORA_B weight's gradient if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { @@ -424,23 +354,35 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 
0.0f : 1.0f; + // std::cout << "Lora B gradient computation, beta = " << (float) beta << + // std::endl; + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " + << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), + lora_config.rank * num_peft_tokens, + filename.c_str()); + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, - rank, + lora_config.rank, out_dim, num_peft_tokens, &scaling_constant, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, weight.w1_grad_ptr, weight_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -452,20 +394,20 @@ void peft_bwd_kernel(Context ctx, checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, - rank, + lora_config.rank, num_peft_tokens, out_dim, &scaling_constant, weight.w1_ptr, weight_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -480,15 +422,15 @@ void peft_bwd_kernel(Context ctx, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, - rank, + lora_config.rank, num_peft_tokens, &alpha, - m->input_activation, + weight.input_activation, input_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, weight.w0_grad_ptr, weight_type, @@ -506,14 +448,14 @@ void peft_bwd_kernel(Context ctx, CUBLAS_OP_N, in_dim, num_peft_tokens, - rank, + lora_config.rank, &alpha, weight.w0_ptr, weight_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, input_grad_ptr, input_type, @@ -523,17 +465,16 @@ void peft_bwd_kernel(Context ctx, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = - m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; - assert(optimizer_config != nullptr); - assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + assert(lora_config.optimizer_config != nullptr); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + + if (lora_config.optimizer_config->getType() == "SGD") { LoraSGDOptimizerConfig const *sgd_config = - (LoraSGDOptimizerConfig const *)optimizer_config; + static_cast( + lora_config.optimizer_config); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<<(weight.w1_grad_ptr), static_cast
(weight.w1_v_values_ptr), static_cast
(weight.w1_ptr)); - } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + } else if (lora_config.optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 09170d3c28..8c2120e283 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -769,6 +769,7 @@ void Linear::peft_bwd_task(Task const *task, num_peft_tokens); } peft_bwd_kernel_wrapper(m, + bc, input_grad.ptr, output_grad.ptr, weight.ptr, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3749cce994..68605160a5 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -3,6 +3,7 @@ #include "flexflow/layer.h" #include "flexflow/model.h" #include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/request_manager.h" #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" @@ -51,18 +52,18 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { +void FFModel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); - if (peft_config.target_modules.size() == 0) { - printf("PEFT config does not contain any target module\n"); - std::cout << peft_config << std::endl; - assert(false); - } - PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); - peft_configs[*peft_model_id] = peft_config; - - for (std::string target_module_name : peft_config.target_modules) { + assert(target_modules.size() > 0 && "LoRA target module name is empty"); + RequestManager *rm = RequestManager::get_request_manager(); + int max_lora_rank = rm->get_max_lora_rank(); + int max_concurrent_adapters = rm->get_max_concurrent_adapters(); + assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); + assert(max_concurrent_adapters > 0 && + "Invalid number of LoRA concurrent adapters"); + + for (std::string target_module_name : target_modules) { assert(target_module_name.length() > 0 && "LoRA target module name is empty"); // find target layer @@ -72,127 +73,84 @@ PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { if (!match) { continue; } - - if (base_layer_to_peft_layer.find(target_module) != - base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - Layer *peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name - ? 
std::string(target_module->name) - : std::string(""); - size_t last_underscore = name_.length() - 1; - for (int i = name_.length() - 1; i > 0; i--) { - if (!(std::isdigit(target_module->name[i]) || - target_module->name[i] == '_')) { - break; - } else if (target_module->name[i] == '_') { - last_underscore = i; - } + assert(base_layer_to_peft_layer.find(target_module) == + base_layer_to_peft_layer.end() && + "LoRA layer already added, attempting to add again"); + // Get input and output tensors from target module + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + // Compute OP_LORA layer name, based on target module name + std::string name_ = target_module->name ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; } - name_.erase(last_underscore); - - name_ += ".lora"; - std::cout << "Adding layer " << name_ << std::endl; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - // fix LoRA layer's transformer layer ID and model ID - peft_layer->layer_guid.transformer_layer_id = - target_module->layer_guid.transformer_layer_id; - peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; - } - peft_layer->outputs[0] = - create_tensor_legion_ordering(numdims, - dims, - output->data_type, - peft_layer, - 0, - true /*create_grad*/); + } + name_.erase(last_underscore); + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + // Create OP_LORA layer given input, output and name + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID (to be the same as + // target module) + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + // set up output tensor for OP_LORA layer + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; } - it = layers.insert(it + 1, peft_layer); - ++it; - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); } + // pass max_rank and max_concurrent_adapters to OP_LORA layer + peft_layer->add_int_property("max_rank", max_lora_rank); + peft_layer->add_int_property("max_concurrent_adapters", + max_concurrent_adapters); + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; } } - - // save finetuned lora model configs to file - if (peft_config.trainable) { - std::string finetuned_model_folder = join_path({ - peft_config.cache_folder, - "finetuned_models", - peft_config.peft_model_id, - }); - 
fs::remove_all(finetuned_model_folder); - std::string finetuned_model_config_folder = join_path({ - finetuned_model_folder, - "config", - }); - fs::create_directories(finetuned_model_config_folder); - std::string lora_linear_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_config.json", - }); - serialize_to_json_file(peft_config, lora_linear_config_filepath); - std::string optimizer_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_optimizer_config.json", - }); - if (typeid(*peft_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*sgd_config, optimizer_config_filepath); - } else if (typeid(*peft_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - LoraAdamOptimizerConfig const *adam_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*adam_config, optimizer_config_filepath); - } else { - assert(false && "Optimizer not supported"); - } - } - - return peft_model_id; } Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - std::unordered_map _peft_configs; - std::vector const &peft_ids = - model.peft_layer_to_peft_id[(Layer *)layer]; - for (int i = 0; i < peft_ids.size(); i++) { - _peft_configs.emplace( - std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); - } + long long value; + layer->get_int_property("max_rank", value); + int max_rank = value; + layer->get_int_property("max_concurrent_adapters", value); + int max_concurrent_adapters = value; return new LoraLinear(model, layer->layer_guid, - layer->op_type, inputs[0], inputs[1], - _peft_configs, + max_rank, + max_concurrent_adapters, layer->name); } @@ -202,10 +160,10 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const output) : LoraLinear(model, other.layer_guid, - other.op_type, input, output, - other.peft_configs, + other.max_rank, + other.max_concurrent_adapters, other.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -214,22 +172,23 @@ LoraLinear::LoraLinear(FFModel &model, char const *name) : LoraLinear(model, params.layer_guid, - params.type, inputs.first, inputs.second, - params.peft_configs, + params.max_rank, + params.max_concurrent_adapters, params.name) {} LoraLinear::LoraLinear( FFModel &model, LayerID const &_layer_guid, - OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, - std::unordered_map const &_peft_configs, + int _max_rank, + int _max_concurrent_adapters, + // std::unordered_map const &_peft_configs, char const *name) : Op(model, - _op_type, + OP_LORA, _output->data_type, name, 2 /*inputs*/, @@ -256,9 +215,11 @@ LoraLinear::LoraLinear( outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } - for (auto const &kv : _peft_configs) { - peft_configs.insert(kv); - } + // for (auto const &kv : _peft_configs) { + // peft_configs.insert(kv); + // } + max_rank = _max_rank; + max_concurrent_adapters = _max_concurrent_adapters; // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -313,56 +274,6 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } -template -void load_peft_from_file(DT *ptr, - size_t num_rows, - size_t num_columns, - int num_shards, - int shard_id, - std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open 
file: %s\n", filepath.c_str());
-  }
-  assert(in.good() && "incorrect weight file path");
-
-  // HuggingFace dims (serialized in row-major order)
-  // lora_A: [rank, intermediate_dim]
-  // lora_B: [hidden_dim, rank]
-  // FlexFlow dims (serialized in column-major order)
-  // lora_A: [intermediate_dim, rank]
-  // lora_B: [rank, out_dim]
-  // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B
-  assert(num_rows % num_shards == 0);
-  size_t chunk_size = num_rows / num_shards;
-  size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0;
-
-  // Allocate memory for the weight shard
-  std::vector<DT>
host_array(chunk_size * num_columns); - // Read the chunk - size_t total_size_read = 0; - for (int i = 0; i < num_columns; ++i) { - in.seekg((i * num_rows + offset) * sizeof(DT)); - in.read(reinterpret_cast(host_array.data() + i * chunk_size), - chunk_size * sizeof(DT)); - total_size_read += in.gcount(); - } - // Check weight shard size - size_t expected_data_size = chunk_size * num_columns * sizeof(DT); - if (total_size_read != expected_data_size) { - printf("load weight data error: expected %lu bytes, got: %lu bytes, data " - "size: %lu\n", - expected_data_size, - total_size_read, - sizeof(DT)); - assert(false); - } - assert(host_array.size() == chunk_size * num_columns); - // Copy weight to device memory - copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); - in.close(); -} - /* regions[0](O): output regions[1](I): kernel @@ -428,162 +339,20 @@ OpMeta *LoraLinear::init_task(Task const *task, std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - for (auto const &kv : lora->peft_configs) { - PEFTModelID const &model_id = kv.first; - LoraLinearConfig const &lora_config = kv.second; - - int rank = lora_config.rank; - - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; - // values below represent total weight sizes before sharding. Lora B is not - // sharded. - int lora_A_num_rows = in_dim * num_shards; - int lora_A_num_cols = rank; - int lora_B_num_rows = rank; - int lora_B_num_cols = out_dim; - int lora_A_num_shards = num_shards; - int lora_B_num_shards = 1; - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - weight.num_shards = num_shards; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - - if (!lora_config.init_lora_weights) { - // load weights from file - std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", - lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A.weight"}); - std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B.weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, + // allocate space for lora weights + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); + m->peft_memory_manager = + new PEFTMemoryManager(gpu_mem, + lora->max_rank, + lora->max_concurrent_adapters, + BatchConfig::max_sequence_length(), + in_dim, + out_dim, + num_shards, shard_id, - w1_filepath); - } else { - assert(false && "Data type not supported"); - } - } else { - // initialize weights - int seed = 0; - init_kernel_wrapper(m, seed); - } - - // allocate space for gradients if the LoRA layer is trainable - if (lora_config.trainable) { - // Ensure we have an optimizer - assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); - assert(typeid(*lora_config.optimizer_config) != - typeid(LoraOptimizerConfig) && - "Optimizer config is not a subclass of LoraOptimizerConfig"); - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - // allocate space for v_values if needed by optimizer - if (typeid(*lora_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const 
*sgd_config = - static_cast( - lora_config.optimizer_config); - if (sgd_config->momentum > 0.0f) { - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - } - } else if (typeid(*lora_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - assert(false && "Adam optim not yet implemented"); - } else { - assert(false && "Optimizer not supported"); - } - } - assert(m->model_state.find(model_id) == m->model_state.end()); - m->model_state[model_id].weights = weight; - m->model_state[model_id].optimizer_config = lora_config.optimizer_config; - m->model_state[model_id].lora_alpha = lora_config.lora_alpha; - m->model_state[model_id].cache_folder = lora_config.cache_folder; - m->model_state[model_id].peft_model_id = lora_config.peft_model_id; - } + lora_layername_substr, + dt); + m->peft_memory_manager->allocate_inference_memory(); return m; } @@ -655,8 +424,8 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); @@ -707,12 +476,20 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - int rank, num_tokens; - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; - rank = weight.rank; - num_tokens = input.domain.get_volume() / weight.in_dim; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -721,21 +498,38 @@ void LoraLinear::inference_task(Task const *task, dst_filepath_weights.string() + ".weight_B.original"; if (m->input_type[0] == DT_FLOAT) { save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { save_tensor((half *)weight.w0_ptr, - 
weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else { assert(false && "Data type not supported"); } + + if (bc->requestsInfo[i].peft_bwd) { + int num_tokens = input.domain.get_volume() / in_dim; + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } } filename = dst_filepath.string() + ".output_0"; @@ -749,21 +543,6 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - if (bc->num_active_peft_tokens() > 0) { - // input activation (intermediate) - filename = dst_filepath.string() + ".low_rank_activation"; - if (output.data_type == DT_FLOAT) { - save_tensor((float *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else if (output.data_type == DT_HALF) { - save_tensor((half *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else { - assert(false); - } - } m->decoding_step++; } } @@ -819,6 +598,8 @@ void lora_inference_debugging(LoraLinearMeta *m, GenericTensorAccessorW input_grad, GenericTensorAccessorR output_grad, int shard_id) { + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // get layer name std::string lora_layername = std::string(m->op_name); std::string searchString = "lora"; @@ -852,10 +633,22 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; - assert(m->model_state.size() >= 1 && "Model state empty!"); - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + continue; + } + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); std::string filename_weight_A = dst_filepath_weights.string() + ".weight_A.finetuned"; std::string filename_weight_B = @@ -867,36 +660,36 @@ void lora_inference_debugging(LoraLinearMeta *m, if (m->input_type[0] == DT_FLOAT) { // weight A save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((float *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((float *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + 
lora_config.rank * out_dim, filename_grad_B.c_str()); } else if (m->input_type[0] == DT_HALF) { // weight A save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((half *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((half *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_grad_B.c_str()); } else { assert(false && "Data type not supported"); @@ -975,62 +768,50 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); std::string weight_export_folder = join_path({ - m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + lora_config.cache_folder, "finetuned_models", - m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + lora_config.peft_model_id, "weights", "shard_" + std::to_string(shard_id), }); fs::create_directories(weight_export_folder); - int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int rank = lora_config.rank; int w0_num_elements = rank * in_dim; int w1_num_elements = rank * out_dim; std::string w0_filepath = join_path( {weight_export_folder, lora_layername_substr + "_A.weight"}); std::string w1_filepath = join_path( {weight_export_folder, lora_layername_substr + "_B.weight"}); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); if (m->input_type[0] == DT_FLOAT) { - save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((float *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (float *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else if (m->input_type[0] == DT_HALF) { - save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((half *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (half *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else { assert(false && "Data type not supported"); @@ 
-1065,7 +846,8 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(ctx, runtime, m, bc, input_grad, output_grad); + peft_bwd_kernel_wrapper( + ctx, runtime, m, bc, shard_id, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); @@ -1098,14 +880,9 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && - lhs.peft_configs.size() == rhs.peft_configs.size()) { - for (auto const &kv : lhs.peft_configs) { - auto it = rhs.peft_configs.find(kv.first); - if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { - return false; - } - } + if (lhs.layer_guid == rhs.layer_guid && lhs.max_rank == rhs.max_rank && + lhs.max_concurrent_adapters == rhs.max_concurrent_adapters && + strcmp(lhs.name, rhs.name) == 0) { return true; } return false; @@ -1144,48 +921,8 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); - sez.serialize(this->op_type); - sez.serialize(this->peft_configs.size()); - for (auto const &kv : this->peft_configs) { - // Serialize PEFTModelID - sez.serialize(kv.first.id); - - // Serialize LoraLinearConfig and OptimizerConfig to tmp folder - // 1. Create tmp dir and serialize it - fs::path unique_temp_dir = create_unique_temp_directory(); - serialize_string(sez, unique_temp_dir.string()); - // 2. Dump LoraLinearConfig to json file in tmp dir - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - serialize_to_json_file(kv.second, lora_config_json_filepath); - // 3. 
Dump optimizer to json file in tmp dir, and serialize optimizer type - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); - if (kv.second.trainable) { - if (typeid(*kv.second.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_SGD); - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*sgd_config, optim_config_filepath); - } else if (typeid(*kv.second.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_ADAM); - LoraAdamOptimizerConfig const *adam_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*adam_config, optim_config_filepath); - } else { - assert(false && "Optimizer type not yet supported"); - } - } - } + sez.serialize(this->max_rank); + sez.serialize(this->max_concurrent_adapters); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1198,8 +935,9 @@ Node LoraLinear::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; - OperatorType op_type; - size_t num_pefts; + int max_rank, max_concurrent_adapters; + // OperatorType op_type; + // size_t num_pefts; size_t name_len; char name[MAX_OPNAME] = {0}; @@ -1208,62 +946,16 @@ Node LoraLinear::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); - dez.deserialize(op_type); - dez.deserialize(num_pefts); - for (int i = 0; i < num_pefts; i++) { - // Deserialize PEFTModelID - size_t pid; - dez.deserialize(pid); - PEFTModelID peft_model_id(pid); - // Deserialize tmp folder containing LoraLinearConfig and optimizer config - fs::path unique_temp_dir = fs::path(deserialize_string(dez)); - // 1. Deserialize LoraLinearConfig - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - std::unique_ptr lora_linear_config = - deserialize_from_json_file(lora_config_json_filepath); - // 2. 
Deserialize optimizer if needed - if (lora_linear_config->trainable) { - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - OptimizerType type_; - dez.deserialize(type_); - if (type_ == OPTIMIZER_TYPE_SGD) { - std::unique_ptr sgd_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast(sgd_optimizer_config.release()); - } else if (type_ == OPTIMIZER_TYPE_ADAM) { - std::unique_ptr adam_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast( - adam_optimizer_config.release()); - } else { - printf("Optimizer type: %d\n", type_); - assert(false && "Optimizer type not yet supported"); - } - } - try { - fs::remove_all(unique_temp_dir); - } catch (fs::filesystem_error const &e) { - std::cerr << "Error removing tmp directory: " << e.what() << std::endl; - } - params.peft_configs.emplace( - std::make_pair(peft_model_id, *lora_linear_config)); - } + dez.deserialize(max_rank); + dez.deserialize(max_concurrent_adapters); dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); params.layer_guid = layer_guid; - params.type = op_type; + // params.type = op_type; + params.max_rank = max_rank; + params.max_concurrent_adapters = max_concurrent_adapters; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -1278,11 +970,13 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; - params.type = this->op_type; + params.max_rank = this->max_rank; + params.max_concurrent_adapters = this->max_concurrent_adapters; + // params.type = this->op_type; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } - params.peft_configs = this->peft_configs; + // params.peft_configs = this->peft_configs; return params; } @@ -1301,17 +995,8 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); - for (auto const &kv : params.peft_configs) { - hash_combine(key, kv.first.id); - hash_combine(key, kv.second.rank); - hash_combine(key, kv.second.trainable); - hash_combine(key, kv.second.cache_folder); - hash_combine(key, kv.second.peft_model_id); - hash_combine(key, kv.second.lora_alpha); - hash_combine(key, kv.second.lora_dropout); - hash_combine(key, kv.second.target_modules); - hash_combine(key, kv.second.init_lora_weights); - } + hash_combine(key, params.max_rank); + hash_combine(key, params.max_concurrent_adapters); return key; } }; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 6e0c60e057..69c0081ec9 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -12,6 +12,17 @@ namespace FlexFlow { // empty optimizer LoraOptimizerConfig::LoraOptimizerConfig() {} +LoraOptimizerConfig *LoraOptimizerConfig::fromJson(nlohmann::json const &j) { + std::string type = j["type"]; + if (type == "SGD") { + return LoraSGDOptimizerConfig::fromJson(j); + } + if (type == "Adam") { + return LoraAdamOptimizerConfig::fromJson(j); + } + throw std::runtime_error("Unknown optimizer type"); +} + // SGD optimizer 
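// Illustrative sketch (not part of this diff): the optimizer configs in this file
// now round-trip through nlohmann::json, with LoraOptimizerConfig::fromJson
// dispatching on the "type" field. Assuming the defaults shown below:
//
//   LoraSGDOptimizerConfig sgd;                      // lr = 0.001, momentum = 0.0
//   nlohmann::json j = sgd.toJson();                 // {"type":"SGD","lr":0.001,...}
//   LoraOptimizerConfig *restored = LoraOptimizerConfig::fromJson(j);
//   // restored points to a heap-allocated LoraSGDOptimizerConfig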
LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} @@ -30,6 +41,24 @@ std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { return os; } +nlohmann::json LoraSGDOptimizerConfig::toJson() const { + return {{"type", "SGD"}, + {"lr", lr}, + {"momentum", momentum}, + {"nesterov", nesterov}, + {"weight_decay", weight_decay}}; +} + +LoraSGDOptimizerConfig * + LoraSGDOptimizerConfig::fromJson(nlohmann::json const &j) { + LoraSGDOptimizerConfig *sgd = new LoraSGDOptimizerConfig(); + sgd->lr = j["lr"]; + sgd->momentum = j["momentum"]; + sgd->nesterov = j["nesterov"]; + sgd->weight_decay = j["weight_decay"]; + return sgd; +} + // Adam optimizer LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), @@ -50,38 +79,26 @@ std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { return os; } -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath) { - json j = obj; - std::ofstream file(filepath); - file << j.dump(4); +nlohmann::json LoraAdamOptimizerConfig::toJson() const { + return {{"type", "Adam"}, + {"alpha", alpha}, + {"beta1", beta1}, + {"beta2", beta2}, + {"weight_decay", weight_decay}, + {"epsilon", epsilon}}; } -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { - std::ifstream file(filepath); - json j; - file >> j; - return std::make_unique(j.get()); +LoraAdamOptimizerConfig * + LoraAdamOptimizerConfig::fromJson(nlohmann::json const &j) { + LoraAdamOptimizerConfig *adam = new LoraAdamOptimizerConfig(); + adam->alpha = j["alpha"]; + adam->beta1 = j["beta1"]; + adam->beta2 = j["beta2"]; + adam->weight_decay = j["weight_decay"]; + adam->epsilon = j["epsilon"]; + return adam; } -template void - serialize_to_json_file(LoraLinearConfig const &obj, - fs::path const &filepath); -template void serialize_to_json_file( - LoraSGDOptimizerConfig const &obj, fs::path const &filepath); -template void serialize_to_json_file( - LoraAdamOptimizerConfig const &obj, fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file(fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); - // ------------------ LoRA configs ------------------- // --------------------------------------------------- const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); @@ -218,4 +235,76 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { return os; } -}; // namespace FlexFlow +double ToThreeDecimalPlaces(float f) { + double d = static_cast(f); + int i; + if (d >= 0) { + i = static_cast(d * 1000 + 0.5); + } else { + i = static_cast(d * 1000 - 0.5); + } + return (i / 1000.0); +} + +std::string LoraLinearConfig::serialize_to_json_string(int indent) const { + nlohmann::json j = {{"cache_folder", cache_folder}, + {"peft_model_id", peft_model_id}, + {"rank", rank}, + {"lora_alpha", ToThreeDecimalPlaces(lora_alpha)}, + {"lora_dropout", ToThreeDecimalPlaces(lora_dropout)}, + {"target_modules", target_modules}, + {"trainable", trainable}, + {"init_lora_weights", init_lora_weights}, + {"base_model_name_or_path", base_model_name_or_path}, + {"precision", precision}, + {"optimizer_config", + optimizer_config + ? 
nlohmann::json(optimizer_config->toJson()) + : nlohmann::json()}}; + + return j.dump(indent); // No indentation +} + +void LoraLinearConfig::serialize_to_json_file( + std::string const &filename) const { + std::string j = serialize_to_json_string(4); + std::ofstream file(filename); + file << j; +} + +// Deserialization method +LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( + std::string const &json_string) { + // std::cout << "Attempting to deserialize from JSON string: " << json_string + // << std::endl; + nlohmann::json j = nlohmann::json::parse(json_string); + LoraOptimizerConfig *optimizer_config_ = nullptr; + if (!j["optimizer_config"].is_null()) { + optimizer_config_ = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + } + LoraLinearConfig config = LoraLinearConfig::EmptyConfig; + config.cache_folder = j["cache_folder"].get(); + config.peft_model_id = j["peft_model_id"].get(); + config.rank = j["rank"].get(); + config.lora_alpha = j["lora_alpha"].get(); + config.lora_dropout = j["lora_dropout"].get(); + config.target_modules = j["target_modules"].get>(); + config.trainable = j["trainable"].get(); + config.init_lora_weights = j["init_lora_weights"].get(); + config.base_model_name_or_path = + j["base_model_name_or_path"].get(); + config.precision = j["precision"].get(); + config.optimizer_config = optimizer_config_; + return config; +} + +// Deserialization method +LoraLinearConfig + LoraLinearConfig::deserialize_from_json_file(std::string const &filename) { + std::ifstream file(filename); + std::string j; + file >> j; + return deserialize_from_json_string(j); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 8213726e8a..31937cef66 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,6 +46,10 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return !(lhs == rhs); +} + std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { os << "NO_ID"; diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index e73893475c..3ebe6cf095 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -16,6 +16,7 @@ #include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/model.h" #include using namespace std; @@ -851,35 +852,70 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } -void FileDataLoader::load_weights(FFModel *ff) { +void FileDataLoader::load_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + + switch (args->data_type) { + case DT_HALF: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_FLOAT: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_INT4: + case DT_INT8: { + args->loader->load_quantization_weight( + args->ff, args->layer, args->weight_idx); + break; + } + default: + assert(false && "Unsupported data type"); + } +} + +void FileDataLoader::load_weights_parallel(FFModel *ff, + Context ctx, + Runtime *runtime) { + std::vector futures; + for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || 
strlen(l->name) < 1) { continue; } + for (int i = 0; i < l->numWeights; i++) { Tensor weight = l->weights[i]; if (weight == NULL) { continue; } - // TODO: currently skip Lora layers + if (l->op_type == OP_LORA) { continue; } - switch (weight->data_type) { - case DT_HALF: - load_single_weight_tensor(ff, l, i); - break; - case DT_FLOAT: - load_single_weight_tensor(ff, l, i); - break; - case DT_INT4: - case DT_INT8: - // load weights in quantization - load_quantization_weight(ff, l, i); - break; - default: - assert(false && "Unsupported data type"); + + if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && + weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { + assert(false && "Unsupported data type"); } + + // Create task arguments + WeightLoadTaskArgs args(ff, this, l, i, weight->data_type); + TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); } } + + // Wait for all tasks to complete + for (Future &f : futures) { + f.get_void_result(); + } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f39ea91f28..45b6ba0db8 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -273,7 +273,9 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } reset_inputs.insert(op->inputs[i]->region); } else { - reset_inputs.insert(op->inputs[i]->region); + if (op->op_type != OP_LORA) { + reset_inputs.insert(op->inputs[i]->region); + } } } } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 417cd2c056..2a95caf6cb 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1550,8 +1550,6 @@ FFRuntime::FFRuntime(FFConfig &config) { config.cpu_offload ? config.offload_reserve_space_size : 0; info.peft_activation_reserve_space_size = config.enable_peft ? config.peft_activation_reserve_space_size : 0; - info.peft_weight_reserve_space_size = - config.enable_peft ? 
config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -3423,62 +3421,29 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ( - // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (std::string(l->name).find("attn.o_proj") != std::string::npos) || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && + (/*llama/mpt attention*/ + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + /*opt/starcoder attention*/ + (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || + /*falcon attention*/ + (std::string(l->name).find("self_attention.o_proj") != + std::string::npos) || + /*llama mlp*/ + (std::string(l->name).find("mlp.down_proj") != std::string::npos) || + /*opt mlp*/ + (std::string(l->name).find("fc2") != std::string::npos) || + /*falcon mlp*/ + (std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || + /*mpt mlp*/ + (std::string(l->name).find("ffn.down_proj") != std::string::npos) || + /*starcoder mlp*/ + (std::string(l->name).find("mlp.c_proj") != std::string::npos))) { return true; } return false; } -#ifdef DEADCODE -bool FFModel::need_to_add_parallel_identity(int layer_idx) const { - auto const &l = layers[layer_idx]; - // add parallel identity (allreduce in the backward pass) before the lm head - // we find the lm head by looking for the linear layer right after a residual - // rms norm / layer norm, and before a softmax, followed by - // argmax/argtopk/sampling - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_RESIDUAL_RMS_NORM || - l->op_type == OP_RESIDUAL_LAYERNORM) && - // there are at least 2 layers before the norm, and at least 3 following - // the norm - layer_idx >= 2 && layer_idx < layers.size() - 3 && - // norm is followed by linear layer (lm head) - layers[layer_idx + 1]->op_type == OP_LINEAR && - // lm head is followed by softmax - layers[layer_idx + 2]->op_type == OP_SOFTMAX && - // softmax is followed by argmax/argtopk/sampling - (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || - layers[layer_idx + 3]->op_type == OP_SAMPLING || - layers[layer_idx + 3]->op_type == OP_ARGMAX || - layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { - return true; - } - return false; -} -#endif bool 
FFModel::need_to_add_parallel_identity(int layer_idx) const { auto const &l = layers[layer_idx]; // add parallel identity (allreduce in the backward pass) before the lm head @@ -4400,7 +4365,6 @@ FFConfig::FFConfig() { enable_peft = DefaultConfig::enablePeft; peft_activation_reserve_space_size = DefaultConfig::peftActivationReserveSpaceSize; - peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4535,10 +4499,6 @@ void FFConfig::parse_args(char **argv, int argc) { peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; continue; } - if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { - peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; - continue; - } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; @@ -4852,6 +4812,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // ElementUnary task { diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 5dab73e1a4..3a250539c7 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - + // #ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -182,33 +182,8 @@ FFHandler } else { handle.peft_activation_allocator = nullptr; } - - if (info->peft_weight_reserve_space_size > 0) { - // allocate memory for peft weight reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance workspaceInst; - Realm::RegionInstance::create_instance(workspaceInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); - handle.peft_weight_allocator = - new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); - } else { - handle.peft_weight_allocator = nullptr; - } - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); +// #endif +// checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; #endif diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc new file mode 100644 index 0000000000..1fcef3678e --- /dev/null +++ b/src/runtime/peft_weight_allocator.cc @@ -0,0 +1,319 @@ +#include "flexflow/utils/peft_weight_allocator.h" + +namespace FlexFlow { +// declare legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using 
Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +void PEFTMemoryManager::allocate_inference_memory() { + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size * max_concurrent_adapters - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::allocate_finetuning_memory() { + size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values + ft_size += max_peft_tokens * (in_dim + max_rank) * + data_type_size(dt); // input, low-rank activations + // allocate chunk of memory for PEFT adapter + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(ft_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, + bool *cache_miss) { + if (finetuning_ptr == nullptr) { + allocate_finetuning_memory(); + } + assert(finetuning_ptr != nullptr && + "PEFT Memory Manager finetuning_ptr is null"); + *cache_miss = (model_id.id != finetuning_model_id.id); + finetuning_model_id = model_id; +} + +int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, + bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out " + "of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); + } else { + mem_slot = lru_list.size(); + } + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out of sync"); + int slot = peft2mem_slot[model_id]; + 
assert(slot >= 0 && slot < max_concurrent_adapters &&
+         "PEFT Memory Manager peft2mem_slot is out of bounds");
+  return slot;
+}
+
+template <typename DT>
+void load_peft_from_file(DT *ptr,
+                         size_t num_rows,
+                         size_t num_columns,
+                         int num_shards,
+                         int shard_id,
+                         std::string filepath) {
+  std::ifstream in(filepath, std::ios::in | std::ios::binary);
+  if (!in.good()) {
+    printf("Could not open file: %s\n", filepath.c_str());
+  }
+  assert(in.good() && "incorrect weight file path");
+
+  // HuggingFace dims (serialized in row-major order)
+  // lora_A: [rank, intermediate_dim]
+  // lora_B: [hidden_dim, rank]
+  // FlexFlow dims (serialized in column-major order)
+  // lora_A: [intermediate_dim, rank]
+  // lora_B: [rank, out_dim]
+  // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B
+  assert(num_rows % num_shards == 0);
+  size_t chunk_size = num_rows / num_shards;
+  size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0;
+
+  // Allocate memory for the weight shard
+  std::vector<DT>
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config) { + // Load weights + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && + "PEFT Memory Manager weight ptr null"); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. + int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = lora_config.rank; + int lora_B_num_rows = lora_config.rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + if (lora_config.init_lora_weights) { + // initialize weights randomly + int seed = 0; + init_peft_weight_wrapper( + weight, in_dim, out_dim, lora_config.rank, dt, seed); + } else { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } +} + +LoraLinearWeight + PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + int mem_slot = get_inference_peft_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +LoraLinearWeight PEFTMemoryManager::get_finetuning_peft( + PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = finetuning_ptr; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + result.w0_grad_ptr = + static_cast(result.w1_ptr) + w1_num_elements * data_size; + result.w1_grad_ptr = + static_cast(result.w0_grad_ptr) + w0_num_elements * data_size; + result.w0_v_values_ptr = + static_cast(result.w1_grad_ptr) + w1_num_elements * data_size; + result.w1_v_values_ptr = + static_cast(result.w0_v_values_ptr) + w0_num_elements * data_size; + result.input_activation = + static_cast(result.w1_v_values_ptr) + + w1_num_elements * data_size; // max_peft_tokens*in_dim + 
result.low_rank_activation = + static_cast(result.input_activation) + + max_peft_tokens * in_dim * data_size; // max_peft_tokens*rank + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +LoraLinearWeight + PEFTMemoryManager::get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + if (lora_config.trainable) { + return get_finetuning_peft(model_id, lora_config); + } else { + return get_inference_peft(model_id, lora_config); + } +} + +void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { + assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu new file mode 100644 index 0000000000..3c4ea91db3 --- /dev/null +++ b/src/runtime/peft_weight_allocator.cu @@ -0,0 +1,80 @@ + + +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include +#include +namespace FlexFlow { + +template +void lora_init_kernel(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + int seed, + cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
lora_a_random_init(w0_num_elements);
+  for (auto &num : lora_a_random_init) {
+    float num_float = dis_lora_a(gen);
+    if (std::is_same<DT, half>::value) {
+      num = __float2half(num_float);
+    } else {
+      num = num_float;
+    }
+  }
+  checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr),
+                            lora_a_random_init.data(),
+                            w0_num_elements * sizeof(DT),
+                            cudaMemcpyHostToDevice,
+                            stream));
+
+  // LoRA_B weight: [rank, out_dim]
+  float stdv_lora_b = 1.0f / sqrt(rank);
+  std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b);
+  std::vector<DT> lora_b_random_init(w1_num_elements);
+  for (auto &num : lora_b_random_init) {
+    float num_float = dis_lora_b(gen);
+    if (std::is_same<DT, half>::value) {
+      num = __float2half(num_float);
+    } else {
+      num = num_float;
+    }
+  }
+  checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); +} + +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (dt == DT_FLOAT) { + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); + } else if (dt == DT_HALF) { + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +} // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 193abbb455..fddaae09ce 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -263,6 +263,73 @@ size_t RequestManager::get_num_ssms() { return ssm_models.size(); } +void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config) { + // check that peft_model_id is not already in use + assert(peft_configs.find(peft_model_id) == peft_configs.end() && + "PEFT model ID already in use"); + // LoraLinearConfig new_config = + // LoraLinearConfig::deserialize_from_json_string( + // peft_config.serialize_to_json_string()); + peft_configs[peft_model_id] = peft_config; +} + +LoraLinearConfig const & + RequestManager::get_peft_config(PEFTModelID const &peft_model_id) { + assert(peft_configs.find(peft_model_id) != peft_configs.end() && + "PEFT model ID not found"); + return peft_configs[peft_model_id]; +} + +void RequestManager::set_max_lora_rank(int max_lora_rank_) { + max_lora_rank = max_lora_rank_; +} + +void RequestManager::set_max_concurrent_adapters(int max_concurrent_adapters_) { + max_concurrent_adapters = max_concurrent_adapters_; +} + +int RequestManager::get_max_lora_rank() { + return max_lora_rank; +} + +int RequestManager::get_max_concurrent_adapters() { + return max_concurrent_adapters; +} + +PEFTModelID * + FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + std::cout << "Registering PEFT adapter" + << peft_config.serialize_to_json_string() << std::endl; + // go over base_layer_to_peft_layer and check that you can find at least one + // match + for (int i = 0; i < peft_config.target_modules.size(); i++) { + bool found = false; + for (auto const &pair : base_layer_to_peft_layer) { + Layer *base_layer = pair.first; + if (base_layer->name != nullptr && strlen(base_layer->name) > 0 && + std::string(base_layer->name).find(peft_config.target_modules[0]) != + std::string::npos) { + found = true; + break; + } + } + assert(found && "Attempting to add LoRA to a LLM target module that does " + "not exist or does not support LoRA"); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_peft_config(*peft_model_id, peft_config); + return peft_model_id; +} + RequestManager::RequestGuid RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); @@ -628,6 +695,18 @@ void RequestManager::check_batch(BatchConfig const &old_bc, } } +void RequestManager::add_peft_config_to_request_info( + BatchConfig &bc, int req_idx, 
LoraLinearConfig const &peft_config) { + std::memset(bc.requestsInfo[req_idx].peft_model_config_str, + 0, + BatchConfig::MAX_PEFT_CONFIG_SIZE); + std::string peft_config_str = peft_config.serialize_to_json_string(); + std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, + peft_config_str.c_str()); + // std::cout << "Added PEFT config to request info: " + // << bc.requestsInfo[req_idx].peft_model_config_str << std::endl; +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -666,6 +745,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int inference_batch_size = BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + int num_concurrent_adapters = 0; + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; for (int i = 0; i < inference_batch_size; i++) { @@ -684,6 +765,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(processed_tokens < request.tokens.size()); bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { + if (is_eos_token(request.tokens.back())) { + // remove the EOS token + request.tokens.pop_back(); + } std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -760,6 +845,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; + std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, + old_bc.requestsInfo[i].peft_model_config_str); + if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { + num_concurrent_adapters += 1; + } new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -811,6 +901,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); + // Step 3: add new inference requests to the next batch if there is space for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { @@ -818,6 +911,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); assert(new_request.req_type == RequestType::REQ_INFERENCE); + + // if the request has peft adapters and we are at capacity, don't add it + // yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && + num_concurrent_adapters == get_max_concurrent_adapters()) { + break; + } + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -829,6 +930,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + if (new_request.peft_model_id != PEFTModelID::NO_ID) { + add_peft_config_to_request_info( + new_bc, i, get_peft_config(new_request.peft_model_id)); + } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; 
@@ -983,7 +1088,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); assert(num_peft_label_tokens == 0); - if (num_peft_tokens > 0) { + if (num_peft_tokens > 0 && + num_concurrent_adapters < get_max_concurrent_adapters()) { assert(new_bc.request_completed[inference_batch_size]); // request info new_bc.request_completed[inference_batch_size] = false; @@ -995,9 +1101,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; - new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + add_peft_config_to_request_info( + new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, request.max_training_steps, @@ -1015,8 +1123,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens++; new_bc.num_peft_tokens++; } + num_concurrent_adapters += 1; } } + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); return new_bc; } @@ -2914,7 +3025,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); // Legion futures for inc_decoding and spec_infer @@ -2976,7 +3087,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); } @@ -2987,7 +3098,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[ssm]->load_weights(ssm); + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); // init operators im->init_operators_inference(ssm); } diff --git a/tests/inference/huggingface_inference_simple.py b/tests/inference/huggingface_inference_simple.py new file mode 100644 index 0000000000..f1cf8450b7 --- /dev/null +++ b/tests/inference/huggingface_inference_simple.py @@ -0,0 +1,51 @@ +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + GenerationConfig, +) + +model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False +max_length = 128 +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",) +hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +generation_config = GenerationConfig.from_pretrained(model_name) +print(generation_config.do_sample) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +generation_config.temperature = None +generation_config.top_p = None + + +def 
run_text_completion(): + prompt = "Help me plan a 1-week trip to Dubai" + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0]) + print(out) + +def run_chat_completion(): + messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + batch = tokenizer(tokenized_chat, return_tensors="pt") + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) + prompt_length = len(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) + all_text = out[prompt_length:] + print(all_text) +run_chat_completion() \ No newline at end of file diff --git a/tests/inference/huggingface_pipeline.py b/tests/inference/huggingface_pipeline.py new file mode 100644 index 0000000000..95388e0a4b --- /dev/null +++ b/tests/inference/huggingface_pipeline.py @@ -0,0 +1,33 @@ +import transformers +from transformers import GenerationConfig + +model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False + +generation_config = GenerationConfig.from_pretrained(model_id) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +# generation_config.max_length = 128 +generation_config.temperature = None +generation_config.top_p = None +print(generation_config) + +pipeline = transformers.pipeline( + "text-generation", + model=model_id, + # model_kwargs={"torch_dtype": torch.bfloat16}, + device_map="auto", +) + +messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + +# messages="Help me plan a 1-week trip to Dubai" +outputs = pipeline( + messages, + max_new_tokens=128, + generation_config=generation_config, +) +print(outputs[0]["generated_text"][-1]['content']) \ No newline at end of file diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py index 8dab7ff43b..1fe2bfbaae 100644 --- a/tests/inference/inference_alignment_test.py +++ b/tests/inference/inference_alignment_test.py @@ -361,7 +361,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE)[:,:,-1].squeeze() hf_tensor = hf_tensor.squeeze() - print(hf_tensor.shape, ff_tensor.shape) + # print(hf_tensor.shape, ff_tensor.shape) compare(hf_tensor, ff_tensor, label="LM head input") output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 2720304d4f..afb7ffb9a7 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -8,8 +8,8 @@ "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # 
optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 8, + "legion_utility_processors": 8, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, @@ -19,7 +19,6 @@ "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -63,15 +62,14 @@ # starcoder_models = ["bigcode/starcoderbase-7b",] parallelism_settings = [(1, 4), (2, 2), (4, 1)] -# The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) -prompt_file = "../../inference/prompt/test.json" -output_folder = "../../inference/output" - # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) +prompt_file = os.path.abspath("../../../inference/prompt/test.json") +output_folder = os.path.abspath("../../../inference/output") + # Generate incremental decoding configs all_models = llama_models + opt_models + falcon_models + mpt_models diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index f5ed8ae65b..a8a9be2f3b 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -430,7 +430,7 @@ def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): print(f"HF: {hf_tensor}\nFF:{ff_tensor}") print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] - print(mismatches) + # print(mismatches) len_hf_tensor = hf_tensor.flatten().shape[0] assert len(mismatches) <= 0.05 * len_hf_tensor print("Ok!") diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index a2fc5548ab..8a53ef8c9c 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -77,7 +77,7 @@ def main(): if args.save_peft_tensors: make_debug_dirs() register_peft_hooks(model) - save_model_weights(model, target_modules=["lora", "lm_head", "down_proj"]) + save_model_weights(model, target_modules=["lora", "lm_head", "down_proj", "up_proj"]) # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index cc677cd51a..c4db87c099 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -17,7 +17,7 @@ def check_bwd_pass(self): def check_step(self, step_idx, learning_rate=0.001): raise NotImplementedError() -class LllamaAlignmentTest(AlignmentTest): +class LlamaAlignmentTest(AlignmentTest): def __init__(self, model_name, tp_degree=1): self.model_name = model_name self.peft_config = PeftConfig.from_pretrained(model_name) @@ -485,12 +485,16 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + down_proj_grad_output_pre = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE, pre=True) + down_proj_grad_output = ff_tensor.clone() + compare_loaded_tensors(down_proj_grad_output, down_proj_grad_output_pre) # LoRA_B hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" ff_tensor_name = 
convert_hf_filename_to_ff(hf_tensor_name) output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + lora_grad_output = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") @@ -501,6 +505,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + lora_a_grad_input = ff_tensor.clone() # W2 (down_proj) input hf_tensor_name = f"layers.{i}.mlp.down_proj" @@ -508,7 +513,15 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + down_proj_grad_input_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, pre=True) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # down proj output (before/after kernel) should match output of lora_b + compare_loaded_tensors(down_proj_grad_output, lora_grad_output) + # down proj input (before kernel) should match input of lora_a + compare_loaded_tensors(down_proj_grad_input_pre, lora_a_grad_input) + # compare_loaded_tensors(down_proj_grad_input_pre.squeeze(), ff_tensor.squeeze()) + # W2 input (HF) and SigmoidSiluMulti output (FF) hf_w2_input = hf_tensor.clone() @@ -538,11 +551,47 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + # print(f"w3 {i} grad output") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # print(f"W3 {i} output matches!") + # print(f"FF shape: {ff_tensor.shape}") + # print(f"HF shape: {hf_tensor.shape}") + + # hf_w3_output = hf_tensor.clone() + + # W3 (up_proj) input input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # w3_input_torch = torch.matmul(hf_tensor, torch.transpose(ff_tensor, 0, 1)) + # ff_up_proj_weight_path="/usr/.cache/flexflow/debug/flexflow/weights/step_0/shard_0/layers.11.layers.11.mlp.up_proj.weight_0" + # hf_up_proj_weight_path="/usr/.cache/flexflow/debug/huggingface/weights/step_0/layers.11.mlp.up_proj.weight" + # 
hf_up_proj_weight = torch.load(hf_up_proj_weight_path, map_location='cpu') + # print(hf_up_proj_weight.shape) + # ff_up_proj_weight = load_ff_tensor(ff_up_proj_weight_path, hf_up_proj_weight.shape[::-1]) + # print(ff_up_proj_weight.shape) + # ff_up_proj_weight = torch.from_numpy(ff_up_proj_weight).to(hf_up_proj_weight.dtype) + # assert torch.allclose(hf_up_proj_weight.T, ff_up_proj_weight, atol=1e-5) + + # print("HF W3 output shape:", hf_w3_output.shape) + # print("HF W3 weight shape:", hf_up_proj_weight.shape) + # print("HF W3 input shape:", hf_tensor.shape) + + # simulated_w3_input = torch.matmul(hf_w3_output.squeeze(), hf_up_proj_weight) + # print("simulated W3 input shape:", simulated_w3_input.T.shape) + # print(simulated_w3_input.T) + # print(f"w3 {i} grad input") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") # Attn O-proj @@ -606,7 +655,8 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) - torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + compare_loaded_tensors(attn_input, input_layernorm_out1, tolerance=1e-5) + # torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) # Input layernorm @@ -695,7 +745,24 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + + lora_low_rank_activation_fwd_path = f"/usr/.cache/flexflow/debug/flexflow/fwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_bwd_path = f"/usr/.cache/flexflow/debug/flexflow/bwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_fwd = load_ff_tensor(lora_low_rank_activation_fwd_path, [16, 128])[:,:self.num_tokens] + lora_low_rank_activation_fwd = torch.from_numpy(lora_low_rank_activation_fwd) + lora_low_rank_activation_bwd = load_ff_tensor(lora_low_rank_activation_bwd_path, [16, 24]) + lora_low_rank_activation_bwd = torch.from_numpy(lora_low_rank_activation_bwd) + torch.testing.assert_close(lora_low_rank_activation_fwd, lora_low_rank_activation_bwd, rtol=1.3e-6, atol=1e-5) + + # print(f"LoRA_B {i} gradient") + # print("FlexFlow shape: ", ff_gradient.shape) + # print(ff_gradient) + # print("HuggingFace shape: ", hf_gradient.shape) + # print(hf_gradient.squeeze().T) compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + + + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") @@ -737,7 +804,7 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): args = parser.parse_args() if __name__ == "__main__": - 
llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) + llama_alignment = LlamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) # llama_alignment.check_weights_alignment() for i in range(args.num_steps): llama_alignment.check_fwd_pass(i) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 5600d57edf..e497d4224e 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -31,22 +31,22 @@ mkdir -p ./inference/output export LEGION_BACKTRACE=1 # Download test model -python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m +python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora # Run PEFT in Huggingface to get ground truth tensors -python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 0.001 # Python test echo "Python test" python ./inference/python/ff_peft.py # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # C++ test echo "C++ test" ./build/inference/peft/peft \ - -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 2 \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ./inference/prompt/peft_dataset.json \ @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # Print succeess message echo "" From 78307b0e8beb5d41ee003be8b5db168c2b3ef4e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Nov 2024 19:13:07 +0000 Subject: [PATCH 43/44] update --- docker/run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker/run.sh b/docker/run.sh index 759da521aa..62d7468a00 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -127,8 +127,7 @@ fi ssh_key_volume="" ssh_key_path="$HOME/.ssh/id_rsa" -if [ -f "$ssh_key_path" ]; then - # If the token exists, add the volume mount to the Docker command - ssh_key_volume+="-v $ssh_key_path:/root/.ssh/id_rsa" +if [ -f "$ssh_key_path" ] && [ -f "$ssh_key_path.pub" ]; then + ssh_key_volume="-v $ssh_key_path:/root/.ssh/id_rsa -v $ssh_key_path.pub:/root/.ssh/id_rsa.pub" fi eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${ssh_key_volume}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" From 518543808b6cd0564e0537601f9d326023d4fe5c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 28 Nov 2024 21:41:35 +0000 Subject: [PATCH 44/44] fix file loader --- include/flexflow/utils/file_loader.h | 31 +++++-- src/runtime/file_loader.cc | 129 +++++++++++++++++++-------- 2 files changed, 118 insertions(+), 42 deletions(-) diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 8735f23571..8ad0f1d14e 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -21,6 +21,7 @@ using namespace std; using namespace FlexFlow; +using namespace Legion; class FileDataLoader { public: @@ -36,16 +37,31 @@ class FileDataLoader { BatchConfig::TokenId *generate_requests(int num, int length); template - void load_single_weight_tensor(FFModel 
*ff, Layer *l, int weight_idx); + void load_single_weight_tensor(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain); - void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); + void load_quantization_weight(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain); static void load_weight_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); + void load_weights_parallel(FFModel *ff, + Legion::Context ctx, + Legion::Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -66,12 +82,15 @@ struct WeightLoadTaskArgs { FileDataLoader *loader; Layer *layer; int weight_idx; + size_t volume, num_replicas; DataType data_type; WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx, + size_t _volume, + size_t _num_replicas, DataType _data_type) - : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), - data_type(_data_type) {} -}; + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), volume(_volume), + num_replicas(_num_replicas), data_type(_data_type) {} +}; \ No newline at end of file diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 3ebe6cf095..6ffa9370f0 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -670,14 +670,20 @@ void load_from_quantized_file(char *ptr, void FileDataLoader::load_quantization_weight(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; - size_t volume = 1; + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain) { + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + for (int i = 0; i < weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } + assert(volume_ == volume * num_replicas); char *data = (char *)malloc(sizeof(char) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -692,7 +698,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, qkv_inner_dim, weight_filename, weights_folder, - weight->data_type, + data_type, use_full_precision); } // else { @@ -714,31 +720,38 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, load_from_quantized_file(data, volume, join_path({weights_folder, weight_filename}), - weight->data_type, + data_type, use_full_precision); } - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(ff, dims_vec, data); + char *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(char)); + ptr += volume; + } - delete data; + free(data); } template void FileDataLoader::load_single_weight_tensor(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain) { // Create a buffer to store weight data from the file - size_t volume = 1; + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + for (int i = 0; i < 
weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } - assert(data_type_size(weight->data_type) == sizeof(DT)); + assert(volume_ == volume * num_replicas); + // assert(data_type_size(weight->data_type) == sizeof(DT)); DT *data = (DT *)malloc(sizeof(DT) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -843,13 +856,15 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } } - // Copy the weight data from the buffer to the weight's ParallelTensor - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor
(ff, dims_vec, data); + // Copy the weight data from the buffer to the weight + DT *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(DT)); + ptr += volume; + } // Free buffer memory - delete data; + free(data); } void FileDataLoader::load_weight_task( @@ -859,21 +874,44 @@ void FileDataLoader::load_weight_task( Legion::Runtime *runtime) { WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + assert(task->regions.size() == regions.size()); + assert(regions.size() == 1); // one weight only + GenericTensorAccessorW weight = helperGetGenericTensorAccessorWO( + args->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + switch (args->data_type) { case DT_HALF: { - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_half_ptr(), + weight_domain); break; } case DT_FLOAT: { - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_float_ptr(), + weight_domain); break; } case DT_INT4: case DT_INT8: { - args->loader->load_quantization_weight( - args->ff, args->layer, args->weight_idx); + args->loader->load_quantization_weight(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_byte_ptr(), + args->data_type, + weight_domain); break; } default: @@ -897,19 +935,38 @@ void FileDataLoader::load_weights_parallel(FFModel *ff, continue; } - if (l->op_type == OP_LORA) { - continue; - } - if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { assert(false && "Unsupported data type"); } + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + // Create task arguments - WeightLoadTaskArgs args(ff, this, l, i, weight->data_type); + size_t volume = 1, num_replicas = 1; + if (weight_pt->sync_type == ParameterSyncType::NCCL) { + for (int i = 0; i < weight_pt->num_dims; i++) { + if (weight_pt->dims[i].is_replica_dim) { + num_replicas *= weight_pt->dims[i].size; + } + } + } else if (weight_pt->sync_type == ParameterSyncType::PS) { + num_replicas = 1; + } else { + num_replicas = 1; + } + for (int i = 0; i < weight->num_dims; i++) { + volume *= weight->dims[i]; + } + WeightLoadTaskArgs args( + ff, this, l, i, volume, num_replicas, weight->data_type); + // launch task asynchronously TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + launcher.add_region_requirement(RegionRequirement( + weight_pt->region, WRITE_ONLY, EXCLUSIVE, weight_pt->region)); + launcher.add_field(0, FID_DATA); futures.push_back(runtime->execute_task(ctx, launcher)); } } @@ -918,4 +975,4 @@ void FileDataLoader::load_weights_parallel(FFModel *ff, for (Future &f : futures) { f.get_void_result(); } -} +} \ No newline at end of file
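The file-loader patch above has each load_weight_task read one host-side copy of a tensor from disk and then write it once per replica into the Legion region mapped for that weight (volume elements per replica, replicas laid out back to back, so the single region requirement added by load_weights_parallel covers all of them). A minimal standalone sketch of that replication step follows; the helper name and raw-pointer interface are illustrative, not the actual FileDataLoader members.

// Sketch only: the replica copy pattern used by load_single_weight_tensor and
// load_quantization_weight in src/runtime/file_loader.cc.
#include <cstring>

template <typename DT>
void replicate_weight(DT *region_ptr,       // start of the mapped weight region
                      DT const *host_copy,  // one copy of the weight read from disk
                      size_t volume,        // number of elements in one replica
                      size_t num_replicas) {
  DT *ptr = region_ptr;
  for (size_t r = 0; r < num_replicas; r++) {
    std::memcpy(ptr, host_copy, volume * sizeof(DT));
    ptr += volume; // the next replica starts immediately after the previous one
  }
}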