From cbfd6528ff286c10b8354fc9e43337c496f5f2b2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 27 Sep 2023 10:21:25 -0400 Subject: [PATCH 001/198] . --- include/flexflow/batch_config.h | 3 +- include/flexflow/config.h | 9 +- include/flexflow/layer.h | 2 +- include/flexflow/model.h | 1 + include/flexflow/op_meta.h | 3 +- include/flexflow/operator.h | 10 +- include/flexflow/ops/experts.h | 2 +- include/flexflow/ops/kernels/linear_kernels.h | 21 ++++ include/flexflow/ops/linear.h | 9 ++ .../ops/tree_inc_multihead_self_attention.h | 2 +- include/flexflow/utils/memory_allocator.h | 5 + src/ops/arg_topk.cc | 2 +- src/ops/argmax.cc | 6 +- src/ops/beam_topk.cc | 2 +- src/ops/conv_2d.cc | 13 +- src/ops/element_binary.cc | 10 +- src/ops/experts.cc | 4 +- src/ops/experts.cpp | 2 +- src/ops/experts.cu | 8 +- src/ops/fused.cpp | 2 +- src/ops/fused.cu | 2 +- src/ops/inc_multihead_self_attention.cc | 46 +++---- src/ops/inc_multihead_self_attention.cpp | 10 +- src/ops/inc_multihead_self_attention.cu | 10 +- src/ops/kernels/linear_kernels.cpp | 97 +++++++++++++++ src/ops/kernels/linear_kernels.cu | 117 ++++++++++++++++++ src/ops/linear.cc | 107 ++++++++++++++-- src/ops/sampling.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 4 +- src/ops/spec_inc_multihead_self_attention.cu | 4 +- src/ops/tree_inc_multihead_self_attention.cpp | 28 ++--- src/ops/tree_inc_multihead_self_attention.cu | 28 ++--- src/runtime/batch_config.cc | 15 ++- src/runtime/inference_manager.cc | 6 +- src/runtime/model.cc | 27 +++- 35 files changed, 500 insertions(+), 119 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ce331d3e41..179e28c246 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -42,7 +42,8 @@ class BatchConfig { using TokenId = int; BatchConfig(); int num_active_requests() const; - int num_active_tokens() const; + int num_active_infr_tokens() const; + int num_active_peft_tokens() const; void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 9716060173..e670bd72fb 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -15,7 +15,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ -#include "ffconst.h" +#include "flexflow/ffconst.h" #include "legion.h" #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -64,6 +64,7 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; #endif class FFConfig; +class MemoryAllocator; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -77,6 +78,11 @@ struct FFHandler { size_t workSpaceSize; void *offload_reserve_space; size_t offload_reserve_space_size; + // PEFT related fields + void *peft_activation_reserve_space; + size_t peft_activation_reserve_space_size; + MemoryAllocator* peft_activation_allocator; + // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL @@ -87,6 +93,7 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; + size_t peft_activation_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 0c1d7a6092..68d292dfe0 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; 
Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + //bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index f88f96cd5a..763610e4cf 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -113,6 +113,7 @@ enum TaskIDs { LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, + LINEAR_PEFT_BWD_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 512844db92..3299201f43 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -15,7 +15,8 @@ class OpMeta { public: FFHandler handle; bool profiling; // Measure the run time of the task - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; DataType output_type[MAX_NUM_OUTPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b2fc7bbfc..cce92a6bd8 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -224,6 +224,13 @@ class Op { MachineView const *mv = nullptr) { assert(false); }; + virtual Legion::FutureMap peft_bwd(FFModel const&, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { + assert(false); + } virtual void print_layer(FFModel const &model) = 0; virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, @@ -311,7 +318,8 @@ class Op { ParallelTensor outputs[MAX_NUM_OUTPUTS]; ParallelTensor inputs[MAX_NUM_INPUTS]; ParallelParameter weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; std::map inference_meta; int numInputs, numWeights, numOutputs; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index d68957d890..f132003d30 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -138,7 +138,7 @@ class Experts : public Op { float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index bbebe3c79b..8f32cb2e83 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -36,6 +36,8 @@ class LinearMeta : public OpMeta { bool use_bias, add_bias_only_once; char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; + // PEFT related fields + void *output_activation_buffer; }; namespace Kernels { @@ -49,6 +51,14 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens); void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -74,6 +84,16 @@ void forward_kernel(LinearMeta const *m, int batch_size, ffStream_t stream); template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, 
+ int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -86,6 +106,7 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); + template __global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 025674c7ba..9b926bec6c 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -52,6 +52,11 @@ class Linear : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * @@ -66,6 +71,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 6e2da19ce9..a6a801d0ad 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - int num_active_tokens; + int num_active_infr_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; }; diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b3..888d172a96 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -54,6 +54,11 @@ class MemoryAllocator { return static_cast
(ptr); } + inline void free_all() { + reserved_allocated_size = 0; + instance_allocated_size = 0; + } + public: Legion::Memory memory; void *reserved_ptr; diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index b877a9f96d..5aa34884f1 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -315,7 +315,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); InferenceResult ir; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 7863931c82..e8e2bd7609 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -345,7 +345,7 @@ BeamInferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); @@ -378,7 +378,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); InferenceResult ir; download_tensor( @@ -429,4 +429,4 @@ size_t hash::operator()( hash_combine(key, params.beam_search); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 93a6de5a8f..331f5c0d3d 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -389,7 +389,7 @@ BeamInferenceResult // total token nums // size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; // size_t batch_size = in1_domain.get_volume() / length; - size_t batch_size = bc.num_active_tokens(); + size_t batch_size = bc.num_active_infr_tokens(); // std::vector beam_width; // std::unordered_map sub_requests = bc->sub_requests; // for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index ce7b6ebc01..db2819e83c 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -592,7 +592,8 @@ OpMeta *Conv2D::init_task(Task const *task, m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; - m->trainableInputs[0] = conv->trainableInputs[0]; + m->trainable_inputs[0] = conv->trainable_inputs[0]; + m->reset_input_grads[0] = conv->trainable_inputs[0]; std::strcpy(m->op_name, conv->name); int input_w = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; @@ -751,7 +752,7 @@ void Conv2D::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[1](I/O): input_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -801,7 +802,7 @@ void Conv2D::backward(FFModel const &ff) { /* region(I): input - region(I/O): input_grad (if trainableInputs[0]) + region(I/O): input_grad (if trainable_inputs[0]) region(I): output 
region(I/O): output_grad region(I): filter @@ -814,17 +815,17 @@ void Conv2D::backward_task(Task const *task, Runtime *runtime) { // Conv2D* conv = (Conv2D*) task->args; Conv2DMeta const *m = *((Conv2DMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { TensorAccessorW acc_input_grad( regions[rid], task->regions[rid], diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 21edad11e3..4f4b55178e 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -416,7 +416,7 @@ OpMeta *ElementBinary::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { - m->trainableInputs[i] = eb->trainableInputs[i]; + m->trainable_inputs[i] = eb->trainable_inputs[i]; } m->op_type = eb->op_type; m->profiling = eb->profiling; @@ -871,7 +871,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[2](I/O): input0_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -889,7 +889,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[1]->region)); launcher.add_field(rid++, FID_DATA); // regions[4](I/O): input1_grad - if (trainableInputs[1]) { + if (trainable_inputs[1]) { launcher.add_region_requirement( RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, @@ -959,7 +959,7 @@ void ElementBinary::backward_task(Task const *task, in0_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain in0_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); assert(in0_domain == in0_grad_domain); @@ -977,7 +977,7 @@ void ElementBinary::backward_task(Task const *task, in1_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[1]) { + if (m->trainable_inputs[1]) { Domain in1_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); // assert(out_grad_domain == in1_domain); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index c8b0ec0f26..6ce5fe82d9 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -670,7 +670,7 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - // int num_active_tokens = bc->num_active_tokens(); + // int num_active_infr_tokens = bc->num_active_infr_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -1058,7 +1058,7 @@ void Experts::inference_task(Task const *task, output_ptr, weights_ptr, bias_ptr, - bc->num_active_tokens(), + bc->num_active_infr_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp 
index c06f02a647..48536defd9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -27,7 +27,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index ce15cdff55..4e3ef6f12c 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -515,7 +515,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -529,8 +529,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - assert(num_active_tokens > 0); - assert(num_active_tokens <= m->effective_batch_size); + assert(num_active_infr_tokens > 0); + assert(num_active_infr_tokens <= m->effective_batch_size); assert(m->effective_batch_size == batch_size); int num_experts_per_block = m->num_experts; @@ -540,7 +540,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; // int num_tokens = m->effective_batch_size; - int num_tokens = num_active_tokens; + int num_tokens = num_active_infr_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index f865c6dd2a..357b063a34 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -654,7 +654,7 @@ __host__ void } assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); + batch_size = bc->num_active_infr_tokens(); Kernels::Linear::forward_kernel_wrapper(m, my_input_accessor[0].ptr, my_output_accessor[0].ptr, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 13927e8ee6..efe55f31ac 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -683,7 +683,7 @@ __host__ void } assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); + batch_size = bc->num_active_infr_tokens(); Kernels::Linear::forward_kernel_wrapper(m, my_input_accessor[0].ptr, my_output_accessor[0].ptr, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7cb9867312..ea0ba9b88d 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -906,7 +906,7 @@ void IncMultiHeadSelfAttention::inference_task( size_t effective_batch_size = max_sequence_length * batch_size; float inputs_arr[data_dim][effective_batch_size] = {0}; - for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { + for (size_t i = 0; i < data_dim * bc->num_active_infr_tokens(); i++) { size_t data_index = i % data_dim; size_t token_index = i / data_dim; assert(data_index < data_dim); @@ -938,11 +938,11 @@ void IncMultiHeadSelfAttention::inference_task( // column-major order. 
// printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " - // "bc->num_active_tokens(): %i, num_q_heads: %lli, + // "bc->num_active_infr_tokens(): %i, num_q_heads: %lli, // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", - // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), + // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_infr_tokens(), // num_q_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); - // for (int t=0; t < bc->num_active_tokens(); t++) { + // for (int t=0; t < bc->num_active_infr_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", // t, bc->token2ids.token_indexes[t].request_index, // bc->token2ids.token_indexes[t].token_position); @@ -1005,7 +1005,7 @@ void IncMultiHeadSelfAttention::inference_task( /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() << std::endl; std::cout << "Torch input size: " << torch_input.sizes() << std::endl; - std::cout << "Number of active tokens: " << bc->num_active_tokens() + std::cout << "Number of active tokens: " << bc->num_active_infr_tokens() << std::endl; */ // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; @@ -1017,10 +1017,10 @@ void IncMultiHeadSelfAttention::inference_task( torch::Tensor qkv_projs = torch::einsum( "ijkl,im->jmkl", {torch_w_qkv, - torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); + torch_input.index({Slice(), Slice(0, bc->num_active_infr_tokens())})}); // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; assert(qkv_projs.sizes()[0] == m->qProjSize); - assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && + assert(qkv_projs.sizes()[1] == bc->num_active_infr_tokens() && qkv_projs.sizes()[1] <= effective_batch_size); assert(qkv_projs.sizes()[2] == 3); assert(qkv_projs.sizes()[3] == num_q_heads); @@ -1033,25 +1033,25 @@ void IncMultiHeadSelfAttention::inference_task( assert(QKVProjArray_cpu != nullptr); std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc->num_active_tokens(), 3, (int)num_q_heads}; + m->qProjSize, bc->num_active_infr_tokens(), 3, (int)num_q_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_tokens() * 3 * num_q_heads, sizeof(float)); + m->qProjSize * bc->num_active_infr_tokens() * 3 * num_q_heads, sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_q_heads; + for (size_t i = 0; i < proj_sum * bc->num_active_infr_tokens() * num_q_heads; i++) { int proj_size_index = i % m->qProjSize; - int head_index = i / (proj_sum * bc->num_active_tokens()); + int head_index = i / (proj_sum * bc->num_active_infr_tokens()); int token_index = - ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % - bc->num_active_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / - (m->qProjSize * bc->num_active_tokens()); + ((i - head_index * proj_sum * bc->num_active_infr_tokens()) / m->qProjSize) % + bc->num_active_infr_tokens(); + int qkv_offset = (i - head_index * proj_sum * bc->num_active_infr_tokens()) / + (m->qProjSize * bc->num_active_infr_tokens()); assert(proj_size_index < proj_sum); assert(head_index < num_q_heads); - assert(token_index < bc->num_active_tokens()); + assert(token_index < bc->num_active_infr_tokens()); assert(qkv_offset < 
3); set_value_row_major(QKVProjArray_converted, QKVProjArray_converted_shape, @@ -1060,7 +1060,7 @@ void IncMultiHeadSelfAttention::inference_task( } torch::Tensor QKVProjArray_torch = torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_tokens(), 3, num_q_heads}, + {m->qProjSize, bc->num_active_infr_tokens(), 3, num_q_heads}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1087,7 +1087,7 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- C++ operations & checks -------------------------- // Store projections into k/v cache arrays for (size_t h = 0; h < num_q_heads; h++) { - for (size_t t = 0; t < bc->num_active_tokens(); t++) { + for (size_t t = 0; t < bc->num_active_infr_tokens(); t++) { for (size_t d = 0; d < m->kProjSize; d++) { size_t kcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + @@ -1124,7 +1124,7 @@ void IncMultiHeadSelfAttention::inference_task( std::vector req_idxs; std::vector r_first_idx; std::vector r_num_tokens; - for (size_t t = 0; t < bc->num_active_tokens(); t++) { + for (size_t t = 0; t < bc->num_active_infr_tokens(); t++) { size_t rid = bc->tokensInfo[t].request_index; if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { req_idxs.push_back(rid); @@ -1140,7 +1140,7 @@ void IncMultiHeadSelfAttention::inference_task( assert(std::accumulate(r_num_tokens.begin(), r_num_tokens.end(), decltype(r_num_tokens)::value_type(0)) == - bc->num_active_tokens()); + bc->num_active_infr_tokens()); // ----------------------- Loading CUDA results for this step --------------- float *keyCache_cpu = @@ -1375,7 +1375,7 @@ void IncMultiHeadSelfAttention::inference_task( torch::Tensor attn_heads[bc->num_active_requests()]; torch::Tensor cpp_output = - torch::zeros({m->oProjSize, bc->num_active_tokens()}); + torch::zeros({m->oProjSize, bc->num_active_infr_tokens()}); // ----------------------- Loading CUDA results for this step --------------- float *qk_prods_cpu = download_tensor( @@ -1595,12 +1595,12 @@ void IncMultiHeadSelfAttention::inference_task( std::cout << "CUDA:" <oProjSize; i++) { std::cout << torch_out_cuda.index({i, Slice(0, - (int64_t)bc->num_active_tokens())}) << std::endl; + (int64_t)bc->num_active_infr_tokens())}) << std::endl; } */ assert(torch::allclose( torch_out_cuda.index( - {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), + {Slice(), Slice(0, (int64_t)bc->num_active_infr_tokens())}), cpp_output, 1e-05, 1e-05)); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 8fb635bace..98a101b723 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -305,7 +305,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize; int m_v = m->vProjSize; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q; int lda = k, ldb = k, ldc = m_q; @@ -342,7 +342,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; @@ -407,7 +407,7 @@ template void 
update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; @@ -508,7 +508,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } checkCUDA(hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->num_active_tokens() * + bc->num_active_infr_tokens() * sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); @@ -573,7 +573,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ec776f4cda..710d20240b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -281,7 +281,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize; int m_v = m->vProjSize; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q; int lda = k, ldb = k, ldc = m_q; @@ -317,7 +317,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; @@ -376,7 +376,7 @@ template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; @@ -475,7 +475,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), + bc->num_active_infr_tokens() * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens @@ -576,7 +576,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 231ca0f3d7..5f756c8f5c 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -143,6 +143,40 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int 
in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } +} + + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -317,6 +351,69 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + DT alpha = 1.0f; + hipDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr offset + input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + hipblasDatatype_t compute_type = HIPBLAS_R_32F; +#endif + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != NULL) { + checkCUDA(hipblasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + + template void backward_kernel(LinearMeta const *m, void const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 8a93357dcf..4ac6bc253f 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -170,6 +170,61 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + 
output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} + + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -380,6 +435,68 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + DT alpha = 1.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr offset + input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != NULL) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index a751ebcc57..f6de5186ad 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -504,7 +504,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->use_bias = linear->use_bias; m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; @@ -638,7 +638,7 @@ void Linear::inference_task(Task const *task, int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; - int batch_size = bc->num_active_tokens(); + int batch_size = 
bc->num_active_infr_tokens(); GenericTensorAccessorR bias; if (m->use_bias && !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { @@ -660,6 +660,99 @@ void Linear::inference_task(Task const *task, batch_size); } +FutureMap Linear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void Linear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta const *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + assert(regions.size() == (3 + static_cast(m->use_bias))); + assert(task->regions.size() == (3 + static_cast(m->use_bias))); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight.ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); +} + void Linear::forward_task(Task const *task, std::vector const ®ions, Context ctx, @@ -775,7 +868,7 @@ void Linear::backward(FFModel const &ff) { 
launcher.add_field(rid++, FID_DATA); // regions[1](I/O): replica_grad assert(replica == NULL); - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -871,17 +964,17 @@ void Linear::backward_task_with_dim(Task const *task, Runtime *runtime) { // Linear* linear = (Linear*) task->args; LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); DT *input_grad = nullptr; size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { @@ -1157,7 +1250,7 @@ bool Linear::measure_operator_cost(Simulator *sim, }; if (sim->computationMode == COMP_MODE_TRAINING) { void *input_grad_ptr = NULL; - if (trainableInputs[0]) { + if (trainable_inputs[0]) { input_grad_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); } else { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 6eb62b2933..f597b9b6b0 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -299,7 +299,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); InferenceResult ir; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index f983238198..3b2b44401e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -150,7 +150,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -218,7 +218,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6ef5145654..2e9a558d6f 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -168,7 +168,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -234,7 +234,7 @@ void 
compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 0fa68bed08..755466a727 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -41,7 +41,7 @@ __global__ void commit_tokens_kernel( int kProjSize, int vProjSize, int num_tokens_to_commit, - int num_active_tokens_in_last_batch, + int num_active_infr_tokens_in_last_batch, int num_q_heads, int num_kv_heads, int max_seq_len) { @@ -58,16 +58,16 @@ __global__ void commit_tokens_kernel( int token_pos = (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; - assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); + assert(token_idx_in_last_batch < num_active_infr_tokens_in_last_batch); int q_array_size = - qProjSize * num_active_tokens_in_last_batch * num_q_heads; + qProjSize * num_active_infr_tokens_in_last_batch * num_q_heads; int k_array_size = - kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_tokens_in_last_batch + + head_idx * proj_size * num_active_infr_tokens_in_last_batch + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -101,7 +101,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -193,8 +193,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize * bc->num_active_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; @@ -238,7 +238,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -517,7 +517,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } template @@ -546,7 +546,7 @@ void 
inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache checkCUDA( @@ -558,9 +558,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, stream)); commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -707,7 +707,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 95ac93ad8a..30ed4e54eb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -41,7 +41,7 @@ __global__ void commit_tokens_kernel( int kProjSize, int vProjSize, int num_tokens_to_commit, - int num_active_tokens_in_last_batch, + int num_active_infr_tokens_in_last_batch, int num_q_heads, int num_kv_heads, int max_seq_len) { @@ -58,16 +58,16 @@ __global__ void commit_tokens_kernel( int token_pos = (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; - assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); + assert(token_idx_in_last_batch < num_active_infr_tokens_in_last_batch); int q_array_size = - qProjSize * num_active_tokens_in_last_batch * num_q_heads; + qProjSize * num_active_infr_tokens_in_last_batch * num_q_heads; int k_array_size = - kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; DT val = devQKVProjArray[q_array_size + (k_cache ? 
0 : k_array_size) + - head_idx * proj_size * num_active_tokens_in_last_batch + + head_idx * proj_size * num_active_infr_tokens_in_last_batch + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -99,7 +99,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -191,8 +191,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize * bc->num_active_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; @@ -234,7 +234,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -515,7 +515,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } template @@ -544,7 +544,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache cudaMemcpyAsync(m->committed_token_infos, @@ -555,9 +555,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, stream); commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -704,7 +704,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d658b6590f..4da520ea97 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -24,7 +24,7 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +BatchConfig::BatchConfig() : num_infr_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].token_start_offset = 0; requestsInfo[i].num_tokens_in_batch = 0; @@ -68,8 +68,12 @@ int BatchConfig::num_active_requests() const { return num_requests; } -int BatchConfig::num_active_tokens() const { - return num_tokens; +int BatchConfig::num_active_infr_tokens() const { + return num_infr_tokens; +} + +int BatchConfig::num_active_peft_tokens() const { + return num_peft_tokens; } void BatchConfig::print() const { @@ -77,7 +81,8 @@ void BatchConfig::print() const { << ") @@@@@@@@@@@@@@" << std::endl; std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; - std::cout << "Number of tokens: " << num_tokens << std::endl; + std::cout << "Number of infr tokens: " << num_infr_tokens << std::endl; + std::cout << "Number of peft tokens: " << num_peft_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; // std::cout << "Cached results: " << cached_results << std::endl; @@ -98,7 +103,7 @@ void BatchConfig::print() const { } std::cout << "Per-token info:\n"; - for (int i = 0; i < num_tokens; i++) { + for (int i = 0; i < num_infr_tokens + num_peft_tokens; i++) { std::cout << " Token " << i << ":\n"; std::cout << " Absolute depth in request: " << tokensInfo[i].abs_depth_in_request << std::endl; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f36dcb2922..dc1a9f6611 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -292,11 +292,11 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) num_active_requests(%d)", // bc.get_mode(), - // bc.num_active_tokens(), + // bc.num_active_infr_tokens(), // bc.num_active_requests()); - // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // assert(bc.num_active_infr_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; diff --git a/src/runtime/model.cc 
b/src/runtime/model.cc index 0cb50733a3..48fe5c4fe8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -144,8 +144,8 @@ Op::Op(FFModel &model, inputs[i] = tensors[i]; } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = nullptr; @@ -188,8 +188,8 @@ Op::Op(FFModel &model, } } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = NULL; @@ -1463,7 +1463,8 @@ bool Op::get_weight_parameter(TNParameter tnp, OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { - trainableInputs[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_INPUTS; i++) { input_type[i] = DT_NONE; @@ -3447,7 +3448,7 @@ void FFModel::compile(LossType loss_type, for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != nullptr); if (op->inputs[i]->owner_op->op_type == OP_INPUT) { - op->trainableInputs[i] = false; + op->trainable_inputs[i] = false; } } } @@ -5364,6 +5365,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, "Linear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); From 60702fc74309a9c446f7ab78abc50e112e16831a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 27 Sep 2023 10:45:44 -0400 Subject: [PATCH 002/198] format --- include/flexflow/config.h | 2 +- include/flexflow/layer.h | 2 +- include/flexflow/operator.h | 2 +- src/ops/inc_multihead_self_attention.cc | 24 +++++---- src/ops/inc_multihead_self_attention.cu | 3 +- src/ops/kernels/linear_kernels.cpp | 54 ++++++++++--------- src/ops/kernels/linear_kernels.cu | 17 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 13 ++--- src/ops/tree_inc_multihead_self_attention.cu | 13 ++--- src/runtime/batch_config.cc | 11 ++-- src/runtime/inference_manager.cc | 3 +- src/runtime/model.cc | 3 +- 12 files changed, 82 insertions(+), 65 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index e670bd72fb..1d74a38468 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -81,7 +81,7 @@ struct FFHandler { // PEFT related fields void *peft_activation_reserve_space; size_t peft_activation_reserve_space_size; - MemoryAllocator* peft_activation_allocator; + MemoryAllocator *peft_activation_allocator; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 68d292dfe0..9865501f5f 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - //bool 
trainable_inputs[MAX_NUM_INPUTS]; + // bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index cce92a6bd8..32e66e4e72 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -224,7 +224,7 @@ class Op { MachineView const *mv = nullptr) { assert(false); }; - virtual Legion::FutureMap peft_bwd(FFModel const&, + virtual Legion::FutureMap peft_bwd(FFModel const &, BatchConfigFuture const &, std::vector const &, std::vector const &, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ea0ba9b88d..1484c424bb 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -940,8 +940,9 @@ void IncMultiHeadSelfAttention::inference_task( // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " // "bc->num_active_infr_tokens(): %i, num_q_heads: %lli, // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", - // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_infr_tokens(), - // num_q_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); + // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, + // bc->num_active_infr_tokens(), num_q_heads, + // BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); // for (int t=0; t < bc->num_active_infr_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", // t, bc->token2ids.token_indexes[t].request_index, @@ -1035,7 +1036,8 @@ void IncMultiHeadSelfAttention::inference_task( std::vector QKVProjArray_converted_shape = { m->qProjSize, bc->num_active_infr_tokens(), 3, (int)num_q_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_infr_tokens() * 3 * num_q_heads, sizeof(float)); + m->qProjSize * bc->num_active_infr_tokens() * 3 * num_q_heads, + sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically @@ -1045,10 +1047,12 @@ void IncMultiHeadSelfAttention::inference_task( int proj_size_index = i % m->qProjSize; int head_index = i / (proj_sum * bc->num_active_infr_tokens()); int token_index = - ((i - head_index * proj_sum * bc->num_active_infr_tokens()) / m->qProjSize) % + ((i - head_index * proj_sum * bc->num_active_infr_tokens()) / + m->qProjSize) % bc->num_active_infr_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_infr_tokens()) / - (m->qProjSize * bc->num_active_infr_tokens()); + int qkv_offset = + (i - head_index * proj_sum * bc->num_active_infr_tokens()) / + (m->qProjSize * bc->num_active_infr_tokens()); assert(proj_size_index < proj_sum); assert(head_index < num_q_heads); assert(token_index < bc->num_active_infr_tokens()); @@ -1058,10 +1062,10 @@ void IncMultiHeadSelfAttention::inference_task( {proj_size_index, token_index, qkv_offset, head_index}, QKVProjArray_cpu[i]); } - torch::Tensor QKVProjArray_torch = - torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_infr_tokens(), 3, num_q_heads}, - torch::kFloat32); + torch::Tensor QKVProjArray_torch = torch::from_blob( + QKVProjArray_converted, + {m->qProjSize, bc->num_active_infr_tokens(), 3, num_q_heads}, + torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- // std::cout << "QKVProjArray_torch" << std::endl; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 
710d20240b..a3061c4c8e 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -475,7 +475,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->num_active_infr_tokens() * sizeof(BatchConfig::PerTokenInfo), + bc->num_active_infr_tokens() * + sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 5f756c8f5c..87b39126c5 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -176,7 +176,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } } - void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -369,7 +368,7 @@ void peft_bwd_kernel(LinearMeta const *m, hipDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset - input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; + input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -378,11 +377,17 @@ void peft_bwd_kernel(LinearMeta const *m, #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { - relu_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else if (m->activation == AC_MODE_SIGMOID) { - sigmoid_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); @@ -392,28 +397,27 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - in_dim, - num_peft_tokens, - out_dim, - &alpha, - kernel_ptr, - weight_type, - in_dim, - output_grad_ptr, - output_type, - out_dim, - &alpha, - input_grad_ptr, - input_type, - in_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } - template void backward_kernel(LinearMeta const *m, void const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 4ac6bc253f..0f60bfe17b 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -224,7 +224,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } } - void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -453,7 +452,7 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset - input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; + input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -462,11 +461,17 @@ void peft_bwd_kernel(LinearMeta const *m, #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { - relu_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else if (m->activation == AC_MODE_SIGMOID) { - sigmoid_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 755466a727..9866cc11d6 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -65,10 +65,10 @@ __global__ void commit_tokens_kernel( int k_array_size = kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; - DT val = - devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_infr_tokens_in_last_batch + - token_idx_in_last_batch * proj_size + data_idx]; + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * + num_active_infr_tokens_in_last_batch + + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -193,7 +193,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * + // bc->num_active_infr_tokens(); int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; @@ -238,7 +239,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 30ed4e54eb..adff421e86 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -65,10 +65,10 @@ __global__ void commit_tokens_kernel( int k_array_size = kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; - DT val = - devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_infr_tokens_in_last_batch + - token_idx_in_last_batch * proj_size + data_idx]; + DT val = devQKVProjArray[q_array_size + (k_cache ? 
0 : k_array_size) + + head_idx * proj_size * + num_active_infr_tokens_in_last_batch + + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -191,7 +191,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * + // bc->num_active_infr_tokens(); int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; @@ -234,7 +235,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4da520ea97..0015d958d5 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -24,7 +24,7 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_infr_tokens(0), num_peft_tokens(0) { +BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].token_start_offset = 0; requestsInfo[i].num_tokens_in_batch = 0; @@ -69,11 +69,11 @@ int BatchConfig::num_active_requests() const { } int BatchConfig::num_active_infr_tokens() const { - return num_infr_tokens; + return num_tokens; } int BatchConfig::num_active_peft_tokens() const { - return num_peft_tokens; + return 0; } void BatchConfig::print() const { @@ -81,8 +81,7 @@ void BatchConfig::print() const { << ") @@@@@@@@@@@@@@" << std::endl; std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; - std::cout << "Number of infr tokens: " << num_infr_tokens << std::endl; - std::cout << "Number of peft tokens: " << num_peft_tokens << std::endl; + std::cout << "Number of infr tokens: " << num_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; // std::cout << "Cached results: " << cached_results << std::endl; @@ -103,7 +102,7 @@ void BatchConfig::print() const { } std::cout << "Per-token info:\n"; - for (int i = 0; i < num_infr_tokens + num_peft_tokens; i++) { + for (int i = 0; i < num_tokens; i++) { std::cout << " Token " << i << ":\n"; std::cout << " Absolute depth in request: " << tokensInfo[i].abs_depth_in_request << std::endl; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index dc1a9f6611..584b8cab4c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -292,7 +292,8 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) + // num_active_requests(%d)", // bc.get_mode(), // bc.num_active_infr_tokens(), // 
bc.num_active_requests()); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2f457cccf5..e1a40ca991 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5406,7 +5406,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, "Linear PEFT Backward"); + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, + "Linear PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { From da9ce1be7ef9ad2ae624d0988f094a8feee4713a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 17:57:38 -0400 Subject: [PATCH 003/198] implement LoraLinear --- include/flexflow/batch_config.h | 1 + include/flexflow/ffconst.h | 2 + include/flexflow/model.h | 17 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/lora_linear.h | 112 +++++ src/ops/inc_multihead_self_attention.cc | 4 +- src/ops/kernels/linear_kernels.cu | 6 +- src/ops/kernels/lora_linear_kernels.cu | 373 +++++++++++++++ src/ops/lora_linear.cc | 599 ++++++++++++++++++++++++ src/runtime/batch_config.cc | 4 + src/runtime/model.cc | 49 ++ 11 files changed, 1166 insertions(+), 3 deletions(-) create mode 100644 include/flexflow/ops/lora_linear.h create mode 100644 src/ops/kernels/lora_linear_kernels.cu create mode 100644 src/ops/lora_linear.cc diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 427b2ec3ec..fc243fb365 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -42,6 +42,7 @@ class BatchConfig { using TokenId = int; BatchConfig(); int num_active_requests() const; + int num_active_tokens() const; int num_active_infr_tokens() const; int num_active_peft_tokens() const; static int max_requests_per_batch(); diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 124b46862a..37a178d952 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -172,6 +172,8 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, + // PEFT Ops + OP_LORA_LINEAR, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 526332340b..105c678ba9 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -120,6 +120,9 @@ enum TaskIDs { LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, + LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_INF_TASK_ID, + LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, FLAT_FWD_TASK_ID, FLAT_BWD_TASK_ID, @@ -322,6 +325,7 @@ class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; +class LoraLinear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; @@ -801,6 +805,15 @@ class FFModel { bool position_bias = false, char const *name = NULL); // ======================================== + // PEFT Layers + // ======================================== + void lora_linear(Tensor const input, + Tensor const output, + int rank, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = nullptr, + char const *name = nullptr); + // ======================================== // Inference APIs // ======================================== GenerationResult generate(std::vector &prompts, @@ -1179,6 +1192,10 @@ class FFModel { SigmoidSiluMulti *>, std::unordered_map, Linear *>, + std::unordered_map< + std::pair, + LoraLinearParams>, + LoraLinear *>, std::unordered_map, Pool2D *>, 
std::unordered_map; + + LoraLinear(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + ParallelTensor const output, + int rank, + DataType _data_type, + bool allocate_weights, + char const *name); + LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output, + bool allocate_weights); + LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + + // size_t get_params_hash() const override; + LoraLinearParams get_params() const; + +private: + LoraLinear(int guid, + bool profiling, + ParallelTensor const input, + ParallelTensor const output, + int rank, + bool allocate_weights, + char const *name); + + void register_mappings(); + void register_output_mappings(); + void register_weight_mappings(); + +public: + int rank; +}; + +}; // namespace FlexFlow + +#endif // _FLEXLOW_LORA_LINEAR_FIRST_H diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index bb444ea0ab..1978497c14 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -365,7 +365,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, nullptr /*owner_op*/, - true /*create_grad*/, + model.config.computationMode == COMP_MODE_INFERENCE + ? 
false + : true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 0f60bfe17b..edf3cdaf07 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -451,8 +451,10 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); - // update input_grad_ptr offset - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens; + // update input_grad_ptr and output_grad_ptr offset + input_grad_ptr = static_cast<DT *>
(input_grad_ptr) + num_infr_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu new file mode 100644 index 0000000000..94b62bb399 --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -0,0 +1,373 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) {} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void inference_kernel_wrapper(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + input_ptr, + output_ptr, + weight_first_ptr, + weight_second_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + input_ptr, + output_ptr, + weight_first_ptr, + weight_second_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == 
DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_first_ptr, + weight_second_ptr, + weight_first_grad_ptr, + weight_second_grad_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_first_ptr, + weight_second_ptr, + weight_first_grad_ptr, + weight_second_grad_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void inference_kernel(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + assert(m->weight_type[1] == weight_type); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == weight_type && weight_type == output_type); + // adjust input_ptr and output_ptr offset + // TODO: we currently assume that all inference tokens do not use LoRA + input_ptr = static_cast
<DT *>(input_ptr) + num_infr_tokens * in_dim; + output_ptr = static_cast<DT *>
(output_ptr) + num_infr_tokens * out_dim; + +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = input_type; +#endif + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + m->low_rank_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr, + data_type_size(m->input_type[0]) * num_peft_tokens * + in_dim, + cudaMemcpyDeviceToDevice, + stream)); + // buffer = weight_first * input + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight_first_ptr, + weight_type, + in_dim, + input_ptr, + input_type, + in_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &alpha, + weight_second_ptr, + weight_type, + rank, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + output_ptr, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + assert(weight_type == ff_to_cuda_datatype(m->weight_type[1])); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + cudaDataType_t lr_actv_type = output_type; + // update input_grad_ptr and output_grad_ptr offset + input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // Compute weight_second gradiant + // NOTE: we use alpha=1 for weight_second_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &alpha, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &alpha, + weight_second_grad_ptr, + weight_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute gradiants w.r.t. low_rank activation + // and save the results to low_rank_activation + // NOTE: we use alpha=1 for input_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &alpha, + weight_second_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &alpha, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute weight_first gradiant + // NOTE: we use alpha=1 for kernel_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + weight_first_grad_ptr, + weight_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute input gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != nullptr) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight_first_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc new file mode 100644 index 0000000000..e9da54b04b --- /dev/null +++ b/src/ops/lora_linear.cc @@ -0,0 +1,599 @@ +#include "flexflow/ops/lora_linear.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/layer.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::LoraLinear; + +void FFModel::lora_linear(Tensor const input, + Tensor const output, + int rank, + DataType data_type, + Initializer *kernel_initializer, + char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for dense"); + li = new 
Layer(this, + OP_LORA_LINEAR, + data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + 0 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_LORA_LINEAR, + data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + 0 /*outputs*/, + input); + } + { + int dims[2] = {input->dims[0], rank}; + li->weights[0] = create_weight_legion_ordering(2, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + { + int dims[2] = {rank, output->dims[0]}; + li->weights[1] = create_weight_legion_ordering(2, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->add_int_property("rank", rank); + layers.push_back(li); +} + +Op *LoraLinear::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("rank", value); + int rank = (int)value; + return new LoraLinear(model, + layer->layer_guid, + inputs[0], + inputs[1], + rank, + layer->data_type, + false /*allocate_weights*/, + layer->name); +} + +LoraLinear::LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output, + bool allocate_weights) + : LoraLinear(model, + other.layer_guid, + input, + output, + other.rank, + other.data_type, + allocate_weights, + other.name) {} + +LoraLinear::LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights, + char const *name) + : LoraLinear(model, + params.layer_guid, + inputs.first, + inputs.second, + params.rank, + params.data_type, + allocate_weights, + name) {} + +LoraLinear::LoraLinear(FFModel &model, + LayerID const &_layer_guid, + ParallelTensor const _input, + ParallelTensor const _output, + int _rank, + DataType _data_type, + bool allocate_weights, + char const *name) + : Op(model, + OP_LORA_LINEAR, + _data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + allocate_weights, + 0 /*outputs*/, + _input, + _output), + rank(_rank) { + // overwrite layer_guid + layer_guid = _layer_guid; + data_type = _data_type; + + ParallelTensorShape input_shape = this->inputs[0]->get_shape(); + LoraLinearParams params = this->get_params(); + + if (allocate_weights) { + Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); + // create weight first + { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[1] = inputs[0]->dims[num_dims - 1]; // data parallel + dims[1].size = dims[1].degree; + dims[1].is_replica_dim = true; + dims[0] = inputs[0]->dims[0]; + dims[0].size = inputs[0]->dims[0].size * rank; + weights[0] = + model.create_parallel_weight_legion_ordering(2, + dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + // create weight second + { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[1] = inputs[0]->dims[0]; + dims[1].size = dims[1].degree; + dims[1].is_replica_dim = true; + dims[0] = inputs[1]->dims[0]; + dims[0].size = inputs[1]->dims[0].size * rank; + weights[1] = + model.create_parallel_weight_legion_ordering(2, + dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + } + + // assert(check_output_input_weight_parallel_dims(allocate_weights)); +} + +void LoraLinear::init(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +void LoraLinear::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const 
&batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 0); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_inputs[1]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LoraLinear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, output_tensor); +} + +/* + regions[0](O): output + regions[1](I): kernel + regions[2](I): bias +*/ +OpMeta *LoraLinear::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinear const *lora = (LoraLinear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(lora->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorRW(lora->inputs[1]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW weight_first = + helperGetGenericTensorAccessorRW(lora->weights[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW weight_second = + helperGetGenericTensorAccessorRW(lora->weights[1]->data_type, + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int rank = lora->rank; + int batch_size = output.domain.get_volume() / out_dim; + assert(input.domain.get_volume() == in_dim * batch_size); + assert(weight_first.domain.get_volume() == in_dim * rank); + assert(weight_second.domain.get_volume() == out_dim * rank); + + LoraLinearMeta *m = new LoraLinearMeta(handle, lora); + m->trainable_inputs[0] = lora->trainable_inputs[0]; + std::strcpy(m->op_name, lora->name); + + return m; +} + +void LoraLinear::forward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +FutureMap + LoraLinear::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const 
&batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 0); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_inputs[1]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 4); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int rank = weight_first.domain.get_volume() / in_dim; + assert(in_dim * rank == weight_first.domain.get_volume()); + assert(out_dim * rank == weight_second.domain.get_volume()); + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, + input.ptr, + output.ptr, + weight_first.ptr, + weight_second.ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens); +} + +FutureMap LoraLinear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; 
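// For reference, a hedged sketch of what the launcher set up below wires together
// (names follow lora_linear_kernels.cu in this patch; A/B and lr_grad are shorthand
// for weight_first (in_dim x rank), weight_second (rank x out_dim), and the gradient
// of the rank-r activation; column-major cuBLAS layout assumed). Six regions are
// bound: the input gradient (RW), the output gradient read from the shared
// input[1]/output tensor, the two LoRA weights (RO), and their gradients (RW).
// peft_bwd_kernel first offsets the two gradient pointers past the
// num_infr_tokens inference tokens, then accumulates with alpha = 1:
//   grad_B     += low_rank_activation * output_grad^T
//   lr_grad     = B * output_grad            (accumulated into low_rank_activation)
//   grad_A     += input_activation * lr_grad^T
//   input_grad += A * lr_grad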
+ ParallelTensor output_tensor = batch_inputs[1]; + parallel_is = output_tensor->parallel_is; + MachineView const *view = mv ? mv : &output_tensor->machine_view; + set_argumentmap_for_inference(ff, argmap, output_tensor); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(5, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + assert(regions.size() == 6); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_first_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_second_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int rank = weight_first.domain.get_volume() / in_dim; + assert(in_dim * rank == weight_first.domain.get_volume()); + assert(out_dim * rank == weight_second.domain.get_volume()); + assert(weight_first.domain == weight_first_grad.domain); + assert(weight_second.domain == weight_second_grad.domain); + + int 
num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight_first.ptr, + weight_second.ptr, + weight_first_grad.ptr, + weight_second_grad.ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens); +} + +void LoraLinear::backward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal backward"); +} + +void LoraLinear::print_layer(FFModel const &ff) {} + +bool LoraLinear::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.rank == rhs.rank && + lhs.data_type == rhs.data_type; +} + +void LoraLinear::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->rank); + sez.serialize(this->data_type); +} + +/* static */ +using PCG::Node; +Node LoraLinear::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + int rank; + DataType data_type; + size_t id, transformer_layer_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(rank); + dez.deserialize(data_type); + + LoraLinearParams params; + params.rank = rank; + params.data_type = data_type; + params.layer_guid = layer_guid; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +LoraLinearParams LoraLinear::get_params() const { + LoraLinearParams params; + params.layer_guid = this->layer_guid; + params.rank = this->rank; + params.data_type = this->data_type; + return params; +} + +bool LoraLinearParams::is_valid( + std::pair const &input_shape) + const { + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::LoraLinearParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.rank); + hash_combine(key, params.data_type); + return key; +} +}; // namespace std diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 6eb2c163ce..33567832f5 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -69,6 +69,10 @@ int BatchConfig::num_active_requests() const { return num_requests; } +int BatchConfig::num_active_tokens() const { + return num_tokens; +} + int BatchConfig::num_active_infr_tokens() const { return num_tokens; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 263405f8ab..c77c4d2432 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -47,6 +47,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -6211,6 +6212,54 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + // PEFT tasks + // LoraLinear tasks + { + TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Init Task"); + } else { + if 
(enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, + "LoraLinear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_PEFT_BWD_TASK_ID, + "LoraLinear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); From 66230bd1d6d50f9094d97ea31892df6f4ffa6ca8 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 17:58:03 -0400 Subject: [PATCH 004/198] add missing files --- .../ops/kernels/lora_linear_kernels.h | 80 +++++++++++++++++++ include/flexflow/ops/lora_linear_params.h | 32 ++++++++ 2 files changed, 112 insertions(+) create mode 100644 include/flexflow/ops/kernels/lora_linear_kernels.h create mode 100644 include/flexflow/ops/lora_linear_params.h diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h new file mode 100644 index 0000000000..520030ece5 --- /dev/null +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -0,0 +1,80 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/lora_linear.h" + +namespace FlexFlow { + +class LoraLinearMeta : public OpMeta { +public: + LoraLinearMeta(FFHandler handle, + LoraLinear const *li); + ~LoraLinearMeta(void); + char op_name[MAX_OPNAME]; + // PEFT related fields + void *low_rank_activation; + void *input_activation; +}; + +namespace Kernels { +namespace LoraLinear { +void inference_kernel_wrapper(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens); +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens); +bool use_activation(ActiMode mode); + +namespace Internal { +template +void inference_kernel(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template +void peft_bwd_kernel(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int 
num_peft_tokens, + ffStream_t stream); +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h new file mode 100644 index 0000000000..545b39d8de --- /dev/null +++ b/include/flexflow/ops/lora_linear_params.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_PARAMS_H +#define _FLEXFLOW_LORA_LINEAR_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +class LoraLinearParams { +public: + LayerID layer_guid; + int rank; + DataType data_type; + + bool is_valid( + std::pair const &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs); +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::LoraLinearParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H From f0d1155a6334b4a6babb5dbe9c8d65a208c10978 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 18:02:38 -0400 Subject: [PATCH 005/198] format --- include/flexflow/ops/kernels/lora_linear_kernels.h | 3 +-- include/flexflow/ops/lora_linear_params.h | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 520030ece5..753167c9c4 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -10,8 +10,7 @@ namespace FlexFlow { class LoraLinearMeta : public OpMeta { public: - LoraLinearMeta(FFHandler handle, - LoraLinear const *li); + LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); char op_name[MAX_OPNAME]; // PEFT related fields diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 545b39d8de..a19a2ff298 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -15,9 +15,10 @@ class LoraLinearParams { int rank; DataType data_type; - bool is_valid( - std::pair const &input_shape) const; - friend bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs); + bool is_valid(std::pair const + &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, + LoraLinearParams const &rhs); }; } // namespace FlexFlow From fb203cced48365db63226444c2b3270e5d70a4c2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 20:55:56 -0400 Subject: [PATCH 006/198] LoraLinear now takes two inputs and generates one output --- include/flexflow/ops/lora_linear.h | 5 +- inference/file_loader.cc | 4 ++ inference/models/llama.cc | 2 + src/ops/lora_linear.cc | 93 ++++++++++++++++++++---------- src/runtime/ffconst_utils.cc | 3 + src/runtime/graph.cc | 6 ++ src/runtime/model.cc | 6 ++ 7 files changed, 89 insertions(+), 30 deletions(-) diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index f60ee4c17b..fff3927ff1 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -54,6 +54,7 @@ class LoraLinear : public Op { std::vector const &, MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; + void map_output_tensors(FFModel &model) override; static 
Op * create_operator_from_layer(FFModel &model, Layer const *layer, @@ -86,7 +87,9 @@ class LoraLinear : public Op { Legion::Deserializer &d, ParallelTensor inputs[], int num_inputs); - + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; // size_t get_params_hash() const override; LoraLinearParams get_params() const; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index dc724319d2..f11df920e3 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -764,6 +764,10 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { if (weight == NULL) { continue; } + // TODO: currently skip Lora layers + if (l->op_type == OP_LORA_LINEAR) { + continue; + } switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, l, i); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index b8fe70526d..da8fc4ee63 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -219,6 +219,8 @@ void LLAMA::create_llama_model(FFModel &ff, 0.0f, std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + ff.lora_linear(multi, w2, 16 /*rank*/); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e9da54b04b..bbfa120886 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -39,26 +39,26 @@ void FFModel::lora_linear(Tensor const input, if (data_type == DT_NONE) { data_type = input->data_type; } + assert(data_type == input->data_type); + assert(data_type == output->data_type); Layer *li = nullptr; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for dense"); - li = new Layer(this, - OP_LORA_LINEAR, - data_type, - name, - 2 /*inputs*/, - 2 /*weights*/, - 0 /*outputs*/, - casted_input); - } else { - li = new Layer(this, - OP_LORA_LINEAR, - data_type, - name, - 2 /*inputs*/, - 2 /*weights*/, - 0 /*outputs*/, - input); + li = new Layer(this, + OP_LORA_LINEAR, + data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, data_type, li, 0, true /*create_grad*/); } { int dims[2] = {input->dims[0], rank}; @@ -144,7 +144,7 @@ LoraLinear::LoraLinear(FFModel &model, 2 /*inputs*/, 2 /*weights*/, allocate_weights, - 0 /*outputs*/, + 1 /*outputs*/, _input, _output), rank(_rank) { @@ -194,7 +194,16 @@ LoraLinear::LoraLinear(FFModel &model, CHOSEN_SYNC_TYPE); } } - + // Create output tensor + { + int numdim = inputs[1]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[1]->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, inputs[1]->data_type, this); + } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -209,10 +218,12 @@ void LoraLinear::init_inference( MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 0); + assert(batch_outputs.size() == 1); + // Assert that the output is the same as the second input + assert(batch_outputs[0] == batch_inputs[1]); // assert(check_output_input_weight_same_machine_view()); // output is considered as an 
input to allow in-place optimization - ParallelTensor output_tensor = batch_inputs[1]; + ParallelTensor output_tensor = batch_outputs[0]; parallel_is = output_tensor->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -276,7 +287,7 @@ OpMeta *LoraLinear::init_task(Task const *task, ctx, runtime); GenericTensorAccessorW output = - helperGetGenericTensorAccessorRW(lora->inputs[1]->data_type, + helperGetGenericTensorAccessorRW(lora->outputs[0]->data_type, regions[1], task->regions[1], FID_DATA, @@ -323,10 +334,12 @@ FutureMap MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 0); + assert(batch_outputs.size() == 1); + // Assert that the output is the same as the second input + assert(batch_outputs[0] == batch_inputs[1]); // assert(check_output_input_weight_same_machine_view()); // output is considered as an input to allow in-place optimization - ParallelTensor output_tensor = batch_inputs[1]; + ParallelTensor output_tensor = batch_outputs[0]; parallel_is = output_tensor->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -416,10 +429,14 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output is the same as the second input + assert(batch_outputs[0] == batch_inputs[1]); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - ParallelTensor output_tensor = batch_inputs[1]; + ParallelTensor output_tensor = batch_outputs[0]; parallel_is = output_tensor->parallel_is; MachineView const *view = mv ? mv : &output_tensor->machine_view; set_argumentmap_for_inference(ff, argmap, output_tensor); @@ -494,11 +511,11 @@ void LoraLinear::peft_bwd_task(Task const *task, GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + m->weight_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorW weight_first_grad = helperGetGenericTensorAccessorRW( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); GenericTensorAccessorW weight_second_grad = helperGetGenericTensorAccessorRW( - m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + m->weight_type[1], regions[5], task->regions[5], FID_DATA, ctx, runtime); int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; @@ -530,6 +547,17 @@ void LoraLinear::backward(FFModel const &ff) { void LoraLinear::print_layer(FFModel const &ff) {} +void LoraLinear::map_output_tensors(FFModel &ff) { + assert(numOutputs == 1); + assert(numInputs == 2); + assert(outputs[0]->get_volume() == inputs[1]->get_volume()); + outputs[0]->parallel_is = inputs[1]->parallel_is; + outputs[0]->region = inputs[1]->region; + outputs[0]->part = inputs[1]->part; + outputs[0]->region_grad = inputs[1]->region_grad; + outputs[0]->part_grad = inputs[1]->part_grad; +} + bool LoraLinear::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { @@ -571,6 +599,13 @@ Node LoraLinear::deserialize(FFModel &ff, return 
ff.get_or_create_node({inputs[0], inputs[1]}, params); } +Op *LoraLinear::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + LoraLinearParams params = get_params(); + return new LoraLinear(ff, params, {inputs[0], inputs[1]}, this->name); +} + LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..47abcacd6a 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -188,6 +188,9 @@ std::string get_operator_type_name(OperatorType type) { return "Sampling"; case OP_ARGMAX: return "ArgMax"; + // PEFT Ops + case OP_LORA_LINEAR: + return "LoraLinear"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 408de57c54..2ed57cd21e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -36,6 +36,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -1995,6 +1996,7 @@ std::pair, std::unordered_map> mv.device_type = MachineView::GPU; mv.ndims = 1; int total_parallel_degree = 1; + assert(op->numOutputs > 0); for (int i = 0; i < op->outputs[0]->num_dims; i++) { total_parallel_degree *= op->outputs[0]->dims[i].degree; } @@ -2722,6 +2724,10 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_LORA_LINEAR: { + node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_MULTIHEAD_ATTENTION: { assert(num_inputs == 3); int embed_dim, num_heads, k_dim, v_dim; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c77c4d2432..2735513af2 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3215,6 +3215,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + // PEFT layers + case OP_LORA_LINEAR: { + Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } From c3d9c3801fcd6dcfce73b151e70da8cb31378f6a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 23:56:53 -0400 Subject: [PATCH 007/198] LoRA forward pass works now --- .../ops/kernels/lora_linear_kernels.h | 6 +-- src/ops/fused.cc | 3 +- src/ops/fused.cu | 41 ++++++++++++++++++- src/ops/kernels/lora_linear_kernels.cu | 2 +- src/ops/lora_linear.cc | 22 ++++++---- src/runtime/inference_manager.cc | 23 ++++++++++- 6 files changed, 82 insertions(+), 15 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 753167c9c4..1ba7347f5e 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H +#ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H #include "flexflow/device.h" #include "flexflow/fftype.h" @@ -76,4 +76,4 @@ void peft_bwd_kernel(LoraLinearMeta *m, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H +#endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/src/ops/fused.cc 
b/src/ops/fused.cc index 1d5db2f461..70650aef0d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -361,8 +361,9 @@ void FusedOp::init_inference(FFModel const &ff, } } for (int i = 0; i < op_num_outputs[op]; i++) { + int my_off = op_output_idx[i + ooff]; assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - my_batch_outputs.push_back(batch_outputs[i + ooff]); + my_batch_outputs.push_back(batch_outputs[my_off]); } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f291ecfd67..ef9dc5d5c6 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -30,6 +30,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -634,10 +635,11 @@ __host__ void my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -700,6 +702,43 @@ __host__ void batch_size); break; } + case OP_LORA_LINEAR: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + Domain weight_first_domain = my_weight_accessor[0].domain; + Domain weight_second_domain = my_weight_accessor[1].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int rank = weight_first_domain.get_volume() / in_dim; + assert(in_dim * rank == weight_first_domain.get_volume()); + assert(out_dim * rank == weight_second_domain.get_volume()); + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + my_weight_accessor[1].ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens); + break; + } case OP_BATCHMATMUL: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 94b62bb399..282134817e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ 
-175,7 +175,7 @@ void inference_kernel(LoraLinearMeta *m, DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); - assert(m->weight_type[1] == weight_type); + assert(m->weight_type[1] == m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; assert(input_type == weight_type && weight_type == output_type); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index bbfa120886..43d1b4cef1 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -219,8 +219,10 @@ void LoraLinear::init_inference( assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); - // Assert that the output is the same as the second input - assert(batch_outputs[0] == batch_inputs[1]); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); // assert(check_output_input_weight_same_machine_view()); // output is considered as an input to allow in-place optimization ParallelTensor output_tensor = batch_outputs[0]; @@ -253,13 +255,13 @@ void LoraLinear::init_inference( launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, weights[1]->region)); launcher.add_field(3, FID_DATA); @@ -335,8 +337,10 @@ FutureMap assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); - // Assert that the output is the same as the second input - assert(batch_outputs[0] == batch_inputs[1]); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); // assert(check_output_input_weight_same_machine_view()); // output is considered as an input to allow in-place optimization ParallelTensor output_tensor = batch_outputs[0]; @@ -431,8 +435,10 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, MachineView const *mv) { assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); - // Assert that the output is the same as the second input - assert(batch_outputs[0] == batch_inputs[1]); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 461873d798..199b94c72c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -142,7 +142,28 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); - + // no need to map inplace tensor + // A tensor is inplace if it shares the same region as another tensor + { + bool inplace = 
false; + for (int j = 0; j < op->numInputs; j++) { + if (op->inputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->inputs[j]]; + inplace = true; + } + } + for (int j = 0; j < i; j++) { + if (op->outputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->outputs[j]]; + inplace = true; + } + } + if (inplace) { + continue; + } + } if (op->op_type == OP_REPLICATE) { assert(op->numInputs == 1 && op->numOutputs == 1); } From c4cfcc37f1d8f7da462076889c8f24749fbca43e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 7 Oct 2023 16:02:03 -0400 Subject: [PATCH 008/198] [LoraLinear] update to allocate weight through per-GPU PEFTWeightAllocator --- include/flexflow/batch_config.h | 12 + include/flexflow/config.h | 2 + include/flexflow/ffconst.h | 2 + include/flexflow/fftype.h | 22 + include/flexflow/model.h | 4 +- .../ops/kernels/lora_linear_kernels.h | 58 +-- include/flexflow/ops/lora_linear.h | 36 +- include/flexflow/ops/lora_linear_params.h | 2 - .../flexflow/utils/peft_weight_allocator.h | 92 ++++ inference/models/llama.cc | 2 +- src/ops/fused.cu | 12 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/kernels/lora_linear_kernels.cu | 418 +++++++++--------- src/ops/lora_linear.cc | 384 ++++++---------- src/runtime/fftype.cc | 16 + src/runtime/model.cc | 16 + 16 files changed, 547 insertions(+), 533 deletions(-) create mode 100644 include/flexflow/utils/peft_weight_allocator.h diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index fc243fb365..b26b9ef823 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "legion.h" #include #include @@ -62,10 +63,21 @@ class BatchConfig { bool loading_prompt = false; struct PerRequestInfo { + PerRequestInfo() { + token_start_offset = 0; + num_tokens_in_batch = 0; + max_sequence_length = 0; + request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; + peft_bwd = false; + } int token_start_offset; int num_tokens_in_batch; int max_sequence_length; RequestGuid request_guid; + // PEFT fields + PEFTModelID peft_model_id; + bool peft_bwd; }; struct PerTokenInfo { int abs_depth_in_request; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 1d74a38468..60d1cb17d2 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -65,6 +65,7 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; class FFConfig; class MemoryAllocator; +class PEFTWeightAllocator; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -82,6 +83,7 @@ struct FFHandler { void *peft_activation_reserve_space; size_t peft_activation_reserve_space_size; MemoryAllocator *peft_activation_allocator; + PEFTWeightAllocator *peft_weight_allocator; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 37a178d952..efc37ce78d 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -271,5 +271,7 @@ enum { TENSOR_GUID_LAST_VALID = 3999999, PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, + PEFT_MODEL_ID_FIRST_VALID = 6000000, + PEFT_MODEL_ID_LAST_VALID = 6999999 }; #endif // _FLEXFLOW_CONST_H_ diff --git 
a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 18ed6b8100..665de43c59 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include +#include namespace FlexFlow { @@ -18,6 +19,27 @@ class LayerID { size_t id, transformer_layer_id; }; +class PEFTModelID { +public: + static const PEFTModelID NO_ID; + PEFTModelID(); + PEFTModelID(size_t id); + bool is_valid_id() const; + friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + +public: + size_t id; +}; + }; // namespace FlexFlow +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::PEFTModelID const &n) const { + return n.id; + } +}; +} // namespace std + #endif // _FF_TYPE_H diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 105c678ba9..cc8d2267cf 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -121,6 +121,7 @@ enum TaskIDs { LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_REG_TASK_ID, LORA_LINEAR_INF_TASK_ID, LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, @@ -809,9 +810,6 @@ class FFModel { // ======================================== void lora_linear(Tensor const input, Tensor const output, - int rank, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = nullptr, char const *name = nullptr); // ======================================== // Inference APIs diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 1ba7347f5e..32a6832e2e 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -8,6 +9,12 @@ namespace FlexFlow { +struct LoraLinearWeight { + void *w0_ptr, *w1_ptr, *w0_grad_ptr, *w1_grad_ptr; + void *w0_state_ptr, *w1_state_ptr; + int rank; +}; + class LoraLinearMeta : public OpMeta { public: LoraLinearMeta(FFHandler handle, LoraLinear const *li); @@ -16,64 +23,39 @@ class LoraLinearMeta : public OpMeta { // PEFT related fields void *low_rank_activation; void *input_activation; + std::unordered_map model_weights; }; namespace Kernels { namespace LoraLinear { void inference_kernel_wrapper(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens); + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); void peft_bwd_kernel_wrapper(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens); -bool use_activation(ActiMode mode); + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); namespace Internal { template void inference_kernel(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, int in_dim, int out_dim, - int rank, - int 
num_infr_tokens, - int num_peft_tokens, ffStream_t stream); template void peft_bwd_kernel(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, int in_dim, int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream); } // namespace Internal } // namespace LoraLinear } // namespace Kernels } // namespace FlexFlow - #endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index fff3927ff1..39d8925262 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -21,19 +21,14 @@ class LoraLinear : public Op { LayerID const &layer_guid, ParallelTensor const input, ParallelTensor const output, - int rank, - DataType _data_type, - bool allocate_weights, - char const *name); + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, - ParallelTensor const output, - bool allocate_weights); + ParallelTensor const output); LoraLinear(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); void init(FFModel const &) override; @@ -43,6 +38,12 @@ class LoraLinear : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void register_peft_model(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv, + PEFTModelID const &model_id, + int rank); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -63,6 +64,11 @@ class LoraLinear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + register_model_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -92,22 +98,6 @@ class LoraLinear : public Op { int num_inputs) const override; // size_t get_params_hash() const override; LoraLinearParams get_params() const; - -private: - LoraLinear(int guid, - bool profiling, - ParallelTensor const input, - ParallelTensor const output, - int rank, - bool allocate_weights, - char const *name); - - void register_mappings(); - void register_output_mappings(); - void register_weight_mappings(); - -public: - int rank; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index a19a2ff298..9eaee3000b 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -12,8 +12,6 @@ namespace FlexFlow { class LoraLinearParams { public: LayerID layer_guid; - int rank; - DataType data_type; bool is_valid(std::pair const &input_shape) const; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h new file mode 100644 index 0000000000..dae46a8af1 --- /dev/null +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -0,0 +1,92 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ + +#include "flexflow/config.h" +#include + +namespace FlexFlow { + +class PEFTWeightAllocator { +public: + PEFTWeightAllocator(void *_base_ptr, size_t _total_size) + : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), + local_offset(_total_size) {} + + inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + void *ptr = static_cast(base_ptr) + sync_offset; + off_t model_sync_weights_offset = sync_offset; + size_t model_sync_weights_size = datalen; + if (sync_weights.find(peft_model_id) != sync_weights.end()) { + // Assert that sync weights for each PEFT model is consecutive + std::pair offset_and_size = sync_weights[peft_model_id]; + assert(sync_offset == offset_and_size.first + offset_and_size.second); + model_sync_weights_offset = offset_and_size.first; + model_sync_weights_size = offset_and_size.second + datalen; + } + sync_offset += datalen; + assert(sync_offset < local_offset); + sync_weights[peft_model_id] = + std::make_pair(model_sync_weights_offset, model_sync_weights_size); + return ptr; + } + + std::pair + get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { + const std::lock_guard lock(peft_weight_allocator_mutex); + assert(sync_weights.find(peft_model_id) != sync_weights.end()); + std::pair offset_and_size = sync_weights[peft_model_id]; + return std::make_pair(static_cast(base_ptr) + offset_and_size.first, + offset_and_size.second); + } + + inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + local_offset -= datalen; + assert(sync_offset < local_offset); + void *ptr = static_cast(base_ptr) + local_offset; + return ptr; + } + + template + inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
<DT *>( + allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + + template <typename DT> + inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast<DT *>
( + allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + +public: + void *base_ptr; + size_t total_size; + off_t sync_offset, local_offset; + std::unordered_map> sync_weights; + std::mutex peft_weight_allocator_mutex; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/models/llama.cc b/inference/models/llama.cc index da8fc4ee63..f90040170e 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2, 16 /*rank*/); + ff.lora_linear(multi, w2); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ef9dc5d5c6..f6d8365f1f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -727,16 +727,8 @@ __host__ void // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); - Kernels::LoraLinear::inference_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - my_weight_accessor[1].ptr, - in_dim, - out_dim, - rank, - num_infr_tokens, - num_peft_tokens); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e0a441ea50..19f3aabb90 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -577,7 +577,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_infr_tokens(); + int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 282134817e..1e9069fa72 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -29,18 +29,15 @@ namespace Kernels { namespace LoraLinear { void inference_kernel_wrapper(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens) { + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->profiling) { cudaEventCreate(&t_start); cudaEventCreate(&t_end); @@ -48,27 +45,19 @@ void inference_kernel_wrapper(LoraLinearMeta *m, } if (m->input_type[0] == DT_FLOAT) { Internal::inference_kernel(m, - input_ptr, - output_ptr, - weight_first_ptr, - weight_second_ptr, + bc, + input.get_float_ptr(), + output.get_float_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } else if (m->input_type[0] == DT_HALF) { Internal::inference_kernel(m, 
- input_ptr, - output_ptr, - weight_first_ptr, - weight_second_ptr, + bc, + input.get_half_ptr(), + output.get_half_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } @@ -90,17 +79,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m, } void peft_bwd_kernel_wrapper(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens) { + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -109,33 +90,23 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { Internal::peft_bwd_kernel(m, - input_grad_ptr, - output_grad_ptr, - weight_first_ptr, - weight_second_ptr, - weight_first_grad_ptr, - weight_second_grad_ptr, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, - input_grad_ptr, - output_grad_ptr, - weight_first_ptr, - weight_second_ptr, - weight_first_grad_ptr, - weight_second_grad_ptr, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } @@ -160,15 +131,11 @@ namespace Internal { template void inference_kernel(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, int in_dim, int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -179,10 +146,6 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; assert(input_type == weight_type && weight_type == output_type); - // adjust input_ptr and output_ptr offset - // TODO: we currently assume that all inference tokens do not use LoRA - input_ptr = static_cast
<DT const *>(input_ptr) + num_infr_tokens * in_dim; - output_ptr = static_cast<DT *>
(output_ptr) + num_infr_tokens * out_dim; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance @@ -190,75 +153,105 @@ void inference_kernel(LoraLinearMeta *m, #else cudaDataType_t compute_type = input_type; #endif - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); - m->low_rank_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[1]) * num_peft_tokens * rank); - // copy input activation - checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr, - data_type_size(m->input_type[0]) * num_peft_tokens * - in_dim, - cudaMemcpyDeviceToDevice, - stream)); - // buffer = weight_first * input - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - rank, - num_peft_tokens, - in_dim, - &alpha, - weight_first_ptr, - weight_type, - in_dim, - input_ptr, - input_type, - in_dim, - &beta, - m->low_rank_activation, - lr_actv_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // output = weight_second * buffer - // Note that we use alpha in both places since we do - // an in-place update for LoraLinear - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_dim, - num_peft_tokens, - rank, - &alpha, - weight_second_ptr, - weight_type, - rank, - m->low_rank_activation, - lr_actv_type, - rank, - &alpha, - output_ptr, - output_type, - out_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != + m->model_weights.end()); + LoraLinearWeight weight = + m->model_weights[bc->requestsInfo[i].peft_model_id]; + int rank = weight.rank; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + m->low_rank_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } + // buffer = weight_first * input + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + tokens_previous_requests * in_dim, + input_type, + in_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + 
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &alpha, + weight.w1_ptr, + weight_type, + rank, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + output_ptr + tokens_previous_requests * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + tokens_previous_requests += num_peft_tokens; + } + assert(tokens_previous_requests == bc->num_active_tokens()); } template void peft_bwd_kernel(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, int in_dim, int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -268,103 +261,124 @@ void peft_bwd_kernel(LoraLinearMeta *m, assert(weight_type == ff_to_cuda_datatype(m->weight_type[1])); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); cudaDataType_t lr_actv_type = output_type; - // update input_grad_ptr and output_grad_ptr offset - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; - output_grad_ptr = - static_cast<DT const *>
(output_grad_ptr) + num_infr_tokens * out_dim; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else cudaDataType_t compute_type = CUDA_R_32F; #endif - // Compute weight_second gradiant - // NOTE: we use alpha=1 for weight_second_grad to accumulate gradients - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - rank, - out_dim, - num_peft_tokens, - &alpha, - m->low_rank_activation, - lr_actv_type, - rank, - output_grad_ptr, - output_type, - out_dim, - &alpha, - weight_second_grad_ptr, - weight_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute gradiants w.r.t. low_rank activation - // and save the results to low_rank_activation - // NOTE: we use alpha=1 for input_grad to accumulate gradients - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - rank, - num_peft_tokens, - out_dim, - &alpha, - weight_second_ptr, - weight_type, - rank, - output_grad_ptr, - output_type, - out_dim, - &alpha, - m->low_rank_activation, - lr_actv_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute weight_first gradiant - // NOTE: we use alpha=1 for kernel_grad to accumulate gradients - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - in_dim, - rank, - num_peft_tokens, - &alpha, - m->input_activation, - input_type, - in_dim, - m->low_rank_activation, - lr_actv_type, - rank, - &alpha, - weight_first_grad_ptr, - weight_type, - in_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute input gradiant - // NOTE: we use alpha=1 for input_grad to accumulate gradients - if (input_grad_ptr != nullptr) { + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != + m->model_weights.end()); + LoraLinearWeight weight = + m->model_weights[bc->requestsInfo[i].peft_model_id]; + int rank = weight.rank; + // Compute w1's gradiant + // NOTE: we use alpha=1 for w1_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, - CUBLAS_OP_N, - in_dim, + CUBLAS_OP_T, + rank, + out_dim, num_peft_tokens, + &alpha, + m->low_rank_activation, + lr_actv_type, rank, + output_grad_ptr + tokens_previous_requests * out_dim, + output_type, + out_dim, &alpha, - weight_first_ptr, + weight.w1_grad_ptr, weight_type, - in_dim, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute gradiants w.r.t. 
low_rank activation + // and save the results to low_rank_activation + // NOTE: we use alpha=1 for input_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &alpha, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr + tokens_previous_requests * out_dim, + output_type, + out_dim, + &alpha, m->low_rank_activation, lr_actv_type, rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute w0's gradiant + // NOTE: we use alpha=1 for kernel_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, &alpha, - input_grad_ptr, + m->input_activation, input_type, in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + weight.w0_grad_ptr, + weight_type, + in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute input gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != nullptr) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + input_grad_ptr + tokens_previous_requests * in_dim, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + tokens_previous_requests += num_peft_tokens; } + assert(tokens_previous_requests == bc->num_active_tokens()); } } // namespace Internal diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 43d1b4cef1..665c5cb4c5 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -4,6 +4,7 @@ #include "flexflow/model.h" #include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" namespace FlexFlow { @@ -32,168 +33,73 @@ using namespace FlexFlow::Kernels::LoraLinear; void FFModel::lora_linear(Tensor const input, Tensor const output, - int rank, - DataType data_type, - Initializer *kernel_initializer, char const *name) { - if (data_type == DT_NONE) { - data_type = input->data_type; - } - assert(data_type == input->data_type); - assert(data_type == output->data_type); - Layer *li = nullptr; - li = new Layer(this, - OP_LORA_LINEAR, - data_type, - name, - 2 /*inputs*/, - 2 /*weights*/, - 1 /*outputs*/, - input, - output); + assert(input->data_type == output->data_type); + Layer *lora = nullptr; + lora = new Layer(this, + OP_LORA_LINEAR, + output->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); { int numdims = output->num_dims; int dims[MAX_TENSOR_DIM]; for (int i = 0; i < numdims; i++) { dims[i] = output->dims[i]; } - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, data_type, li, 0, true /*create_grad*/); - } - { - int dims[2] = {input->dims[0], rank}; - li->weights[0] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - { - int dims[2] = {rank, output->dims[0]}; - li->weights[1] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + lora->outputs[0] = create_tensor_legion_ordering( + numdims, dims, output->data_type, lora, 0, true /*create_grad*/); } - li->add_int_property("rank", rank); - layers.push_back(li); + layers.push_back(lora); } Op *LoraLinear::create_operator_from_layer( FFModel 
&model, Layer const *layer, std::vector const &inputs) { - long long value; - layer->get_int_property("rank", value); - int rank = (int)value; - return new LoraLinear(model, - layer->layer_guid, - inputs[0], - inputs[1], - rank, - layer->data_type, - false /*allocate_weights*/, - layer->name); + return new LoraLinear( + model, layer->layer_guid, inputs[0], inputs[1], layer->name); } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, - ParallelTensor const output, - bool allocate_weights) - : LoraLinear(model, - other.layer_guid, - input, - output, - other.rank, - other.data_type, - allocate_weights, - other.name) {} + ParallelTensor const output) + : LoraLinear(model, other.layer_guid, input, output, other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights, char const *name) - : LoraLinear(model, - params.layer_guid, - inputs.first, - inputs.second, - params.rank, - params.data_type, - allocate_weights, - name) {} + : LoraLinear(model, params.layer_guid, inputs.first, inputs.second, name) {} LoraLinear::LoraLinear(FFModel &model, LayerID const &_layer_guid, ParallelTensor const _input, ParallelTensor const _output, - int _rank, - DataType _data_type, - bool allocate_weights, char const *name) : Op(model, OP_LORA_LINEAR, - _data_type, + _output->data_type, name, 2 /*inputs*/, - 2 /*weights*/, - allocate_weights, + 0 /*weights*/, + false, 1 /*outputs*/, _input, - _output), - rank(_rank) { + _output) { + assert(_input->data_type == _output->data_type); // overwrite layer_guid layer_guid = _layer_guid; - data_type = _data_type; + data_type = _output->data_type; ParallelTensorShape input_shape = this->inputs[0]->get_shape(); LoraLinearParams params = this->get_params(); - if (allocate_weights) { - Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - // create weight first - { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[1] = inputs[0]->dims[num_dims - 1]; // data parallel - dims[1].size = dims[1].degree; - dims[1].is_replica_dim = true; - dims[0] = inputs[0]->dims[0]; - dims[0].size = inputs[0]->dims[0].size * rank; - weights[0] = - model.create_parallel_weight_legion_ordering(2, - dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - // create weight second - { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[1] = inputs[0]->dims[0]; - dims[1].size = dims[1].degree; - dims[1].is_replica_dim = true; - dims[0] = inputs[1]->dims[0]; - dims[0].size = inputs[1]->dims[0].size * rank; - weights[1] = - model.create_parallel_weight_legion_ordering(2, - dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - } // Create output tensor { int numdim = inputs[1]->num_dims; @@ -253,18 +159,6 @@ void LoraLinear::init_inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, output_tensor); @@ -295,27 +189,11 @@ 
OpMeta *LoraLinear::init_task(Task const *task, FID_DATA, ctx, runtime); - GenericTensorAccessorW weight_first = - helperGetGenericTensorAccessorRW(lora->weights[0]->data_type, - regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorW weight_second = - helperGetGenericTensorAccessorRW(lora->weights[1]->data_type, - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; - int rank = lora->rank; int batch_size = output.domain.get_volume() / out_dim; assert(input.domain.get_volume() == in_dim * batch_size); - assert(weight_first.domain.get_volume() == in_dim * rank); - assert(weight_second.domain.get_volume() == out_dim * rank); + assert(output.domain.get_volume() == out_dim * batch_size); LoraLinearMeta *m = new LoraLinearMeta(handle, lora); m->trainable_inputs[0] = lora->trainable_inputs[0]; @@ -324,6 +202,96 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } +struct LoraLinearRegisterInfo { + LoraLinear const *lora; + PEFTModelID model_id; + int rank; +}; + +void LoraLinear::register_peft_model( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv, + PEFTModelID const &model_id, + int rank) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + LoraLinearRegisterInfo info; + info.lora = this; + info.model_id = model_id; + info.rank = rank; + IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, + parallel_is, + TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); +} + +void LoraLinear::register_model_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearRegisterInfo const *info = + static_cast(task->args); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + LoraLinear const *lora = info->lora; + int rank = info->rank; + int num_dims = lora->inputs[0]->num_dims; + int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; + int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[1]); + assert(dt == lora->inputs[0]->data_type); + assert(m->model_weights.find(info->model_id) == m->model_weights.end()); + LoraLinearWeight weight; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * in_dim * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * out_dim * data_type_size(dt)); + weight.rank = rank; + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * in_dim * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + info->model_id, rank * out_dim * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + info->model_id, rank * in_dim * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * out_dim * data_type_size(dt)); + } + m->model_weights[info->model_id] = weight; +} + void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } @@ -372,18 +340,6 @@ FutureMap EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -404,28 +360,12 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_second = 
helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; - int rank = weight_first.domain.get_volume() / in_dim; - assert(in_dim * rank == weight_first.domain.get_volume()); - assert(out_dim * rank == weight_second.domain.get_volume()); - - int num_infr_tokens = bc->num_active_infr_tokens(); - int num_peft_tokens = bc->num_active_peft_tokens(); - inference_kernel_wrapper(m, - input.ptr, - output.ptr, - weight_first.ptr, - weight_second.ptr, - in_dim, - out_dim, - rank, - num_infr_tokens, - num_peft_tokens); + // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, bc, input, output); } FutureMap LoraLinear::peft_bwd(FFModel const &ff, @@ -468,30 +408,6 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[0]->region_grad)); - launcher.add_field(4, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[1]->region_grad)); - launcher.add_field(5, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -512,39 +428,14 @@ void LoraLinear::peft_bwd_task(Task const *task, GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( - m->weight_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorW weight_first_grad = helperGetGenericTensorAccessorRW( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - GenericTensorAccessorW weight_second_grad = helperGetGenericTensorAccessorRW( - m->weight_type[1], regions[5], task->regions[5], FID_DATA, ctx, runtime); - - int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; - int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; - int rank = weight_first.domain.get_volume() / in_dim; - assert(in_dim * rank == weight_first.domain.get_volume()); - assert(out_dim * rank == weight_second.domain.get_volume()); - assert(weight_first.domain == weight_first_grad.domain); - assert(weight_second.domain == weight_second_grad.domain); - - int num_infr_tokens = bc->num_active_infr_tokens(); - int 
num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(m, - input_grad.ptr, - output_grad.ptr, - weight_first.ptr, - weight_second.ptr, - weight_first_grad.ptr, - weight_second_grad.ptr, - in_dim, - out_dim, - rank, - num_infr_tokens, - num_peft_tokens); + + // int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + // int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); } void LoraLinear::backward(FFModel const &ff) { @@ -571,15 +462,12 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.rank == rhs.rank && - lhs.data_type == rhs.data_type; + return lhs.layer_guid == rhs.layer_guid; } void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); - sez.serialize(this->rank); - sez.serialize(this->data_type); } /* static */ @@ -589,18 +477,12 @@ Node LoraLinear::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 2); - int rank; - DataType data_type; size_t id, transformer_layer_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); LayerID layer_guid(id, transformer_layer_id); - dez.deserialize(rank); - dez.deserialize(data_type); LoraLinearParams params; - params.rank = rank; - params.data_type = data_type; params.layer_guid = layer_guid; return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -615,8 +497,6 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; - params.rank = this->rank; - params.data_type = this->data_type; return params; } @@ -633,8 +513,6 @@ size_t hash::operator()( FlexFlow::LoraLinearParams const ¶ms) const { size_t key = 0; hash_combine(key, params.layer_guid.id); - hash_combine(key, params.rank); - hash_combine(key, params.data_type); return key; } }; // namespace std diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 2b94f07999..4c24af85cf 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -25,4 +25,20 @@ bool operator==(LayerID const &lhs, LayerID const &rhs) { return lhs.id == rhs.id; } +const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); + +PEFTModelID::PEFTModelID() : id(0) {} + +PEFTModelID::PEFTModelID(size_t _id) : id(_id) { + assert(is_valid_id()); +} + +bool PEFTModelID::is_valid_id() const { + return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); +} + +bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return lhs.id == rhs.id; +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2735513af2..50b9f5e402 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6235,6 +6235,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LORA_LINEAR_REG_TASK_ID, + "LoraLinear Model Registration"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Model Registration Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + 
runtime->register_task_variant( + registrar); + } + } { TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, "LoraLinear Inference"); From ea8920b02af693b364f2a7986a5ce9e761ed4f11 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 7 Oct 2023 23:38:28 -0400 Subject: [PATCH 009/198] add API for registering PEFT models --- include/flexflow/model.h | 7 ++- include/flexflow/ops/lora_linear.h | 1 - include/flexflow/request_manager.h | 13 +++-- inference/models/llama.cc | 2 +- src/ops/lora_linear.cc | 3 +- src/runtime/model.cc | 1 + src/runtime/request_manager.cc | 80 ++++++++++++++++++++++++++---- 7 files changed, 86 insertions(+), 21 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cc8d2267cf..f98456a268 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -815,7 +815,10 @@ class FFModel { // Inference APIs // ======================================== GenerationResult generate(std::vector &prompts, - int max_seq_length); + int max_seq_length, + PEFTModelID peft_model_id = PEFTModelID::NO_ID); + + PEFTModelID register_peft_model(std::map config); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1112,7 +1115,7 @@ class FFModel { void clear_graph_search_cache(); public: - size_t op_global_guid, layer_global_guid; + size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 39d8925262..23dc8ec496 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -41,7 +41,6 @@ class LoraLinear : public Op { void register_peft_model(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, - MachineView const *mv, PEFTModelID const &model_id, int rank); Legion::FutureMap inference(FFModel const &, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3081aaa1c2..da64ac58a2 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -57,6 +57,7 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; + PEFTModelID peft_model_id; int max_sequence_length; int initial_len; int ssm_cache_size = 0; @@ -112,15 +113,19 @@ class RequestManager { GenerationResult generate_incr_decoding(FFModel *model, std::vector &prompts, - int max_seq_length); + int max_seq_length, + PEFTModelID peft_model_id); GenerationResult generate_spec_infer(FFModel *model, std::vector &prompts, - int max_seq_length); + int max_seq_length, + PEFTModelID peft_model_id); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); + int max_sequence_length, + PEFTModelID peft_model_id); RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + int max_sequence_length, + PEFTModelID peft_model_id); bool is_request_completed(RequestGuid const &guid); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f90040170e..2fe5642507 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string("layers_" + std::to_string(i) + 
"_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2); + ff.lora_linear(multi, w2, "lora_mlp_linear_second"); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 665c5cb4c5..4c92d6cb6c 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -212,7 +212,6 @@ void LoraLinear::register_peft_model( FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, - MachineView const *mv, PEFTModelID const &model_id, int rank) { assert(check_output_input_weight_same_parallel_is()); @@ -229,7 +228,7 @@ void LoraLinear::register_peft_model( ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? mv : &output_tensor->machine_view; + MachineView const *view = &output_tensor->machine_view; size_t machine_view_hash = view->hash(); set_argumentmap_for_inference(ff, argmap, output_tensor); LoraLinearRegisterInfo info; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 50b9f5e402..91361e0cc7 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1535,6 +1535,7 @@ FFRuntime *ffruntime_singleton = nullptr; FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), + peft_model_global_guid(PEFT_MODEL_ID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1b825318dd..1616054148 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,7 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -175,7 +176,8 @@ size_t RequestManager::get_num_ssms() { RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { + int max_sequence_length, + PEFTModelID peft_model_id) { const std::lock_guard lock(request_queue_mutex); // Add a new request @@ -183,6 +185,7 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; + request.peft_model_id = peft_model_id; if (prompt.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " @@ -231,13 +234,15 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { + int max_sequence_length, + PEFTModelID peft_model_id) { const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; + request.peft_model_id = peft_model_id; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } @@ -439,6 +444,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + 
new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; if (new_bc.requestsInfo[i].token_start_offset + 1 == request.tokens.size()) { // Incremental phase @@ -477,6 +484,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; new_bc.request_completed[i] = false; // add profile_info for the new request ProfileInfo profile_info; @@ -1795,24 +1803,71 @@ std::vector> } GenerationResult FFModel::generate(std::vector &prompts, - int max_seq_length) { + int max_seq_length, + PEFTModelID peft_model_id) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding(this, prompts, max_seq_length); + return rm->generate_incr_decoding( + this, prompts, max_seq_length, peft_model_id); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, prompts, max_seq_length); + return rm->generate_spec_infer( + this, prompts, max_seq_length, peft_model_id); } } +PEFTModelID FFModel::register_peft_model(std::map configs) { + PEFTModelID peft_model_id(peft_model_global_guid++); + InferenceManager *im = InferenceManager::get_inference_manager(); + for (size_t op = 0; op < operators.size(); op++) { + if (operators[op]->op_type == OP_LORA_LINEAR) { + std::string opname(operators[op]->name); + // Remove the guid and the ``_'' char from opname: guid has 7 digits + // and ``_'' occupies 1 char + opname.erase(opname.length() - 8); + assert(configs.find(opname) != configs.end()); + int rank = configs[opname]; + LoraLinear *lora = static_cast(operators[op]); + // Currently assume only a single data pipeline + assert(config.data_parallelism_degree == 1); + std::vector inputs(lora->numInputs); + std::vector outputs(lora->numOutputs); + + for (int i = 0; i < lora->numInputs; i++) { + assert(im->tensor_buffer.find(lora->inputs[i]) != + im->tensor_buffer.end()); + assert(lora->inputs[i] != nullptr); + assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->inputs[i]].size() == 1); + inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < lora->numOutputs; i++) { + assert(im->tensor_buffer.find(lora->outputs[i]) != + im->tensor_buffer.end()); + assert(lora->outputs[i] != nullptr); + assert(lora->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->outputs[i]].size() == 1); + outputs[i] = im->tensor_buffer[lora->outputs[i]][0]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); + } + } + return peft_model_id; +} + /*static*/ -GenerationResult RequestManager::generate_incr_decoding( - FFModel *llm, std::vector &prompts, int max_seq_length) { +GenerationResult + RequestManager::generate_incr_decoding(FFModel *llm, + std::vector &prompts, + int max_seq_length, + PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); + guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); } if (guid == 0) { @@ -1864,12 +1919,15 @@ GenerationResult 
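// A minimal usage sketch (assumed caller-side code, mirroring the incr_decoding
// driver later in this series; it presumes an FFModel `model` and a
// std::vector<std::string> `prompts` are already set up): register a LoRA
// adapter of a given rank for a named layer, then pass the returned id to
// generate() so the RequestManager tags each request with it.
std::map<std::string, int> peft_config;
peft_config["lora_mlp_linear_second"] = 4; // layer name -> LoRA rank
PEFTModelID peft_model_id = model.register_peft_model(peft_config);
GenerationResult result =
    model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id);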
RequestManager::generate_incr_decoding( } /*static*/ -GenerationResult RequestManager::generate_spec_infer( - FFModel *llm, std::vector &prompts, int max_seq_length) { +GenerationResult + RequestManager::generate_spec_infer(FFModel *llm, + std::vector &prompts, + int max_seq_length, + PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); + guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); } if (guid == 0) { std::cout From 44cc16b314d4241f6303519a00537a20cc66c3b2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 8 Oct 2023 15:52:47 -0400 Subject: [PATCH 010/198] bug fix --- src/ops/fused.cu | 11 ++--------- src/ops/kernels/lora_linear_kernels.cu | 5 ++--- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f6d8365f1f..948b8c0885 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -707,23 +707,16 @@ __host__ void assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; Domain output_domain = my_output_accessor[0].domain; - Domain weight_first_domain = my_weight_accessor[0].domain; - Domain weight_second_domain = my_weight_accessor[1].domain; int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; - int rank = weight_first_domain.get_volume() / in_dim; - assert(in_dim * rank == weight_first_domain.get_volume()); - assert(out_dim * rank == weight_second_domain.get_volume()); int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == + in_dim * batch_size); LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == 2); assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->output_type[0] == my_output_accessor[0].data_type); - int num_infr_tokens = bc->num_active_infr_tokens(); - int num_peft_tokens = bc->num_active_peft_tokens(); // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 1e9069fa72..ab1ae1b49d 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -141,11 +141,10 @@ void inference_kernel(LoraLinearMeta *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); - cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); - assert(m->weight_type[1] == m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; - assert(input_type == weight_type && weight_type == output_type); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance From 29e5547cb7d4381db129131d52a14345dfb94b22 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 8 Oct 2023 15:53:13 -0400 Subject: [PATCH 011/198] format --- src/ops/fused.cu | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 948b8c0885..d70d01013c 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -712,8 +712,7 @@ __host__ void int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == - in_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->output_type[0] == my_output_accessor[0].data_type); From dfd1c9a0a8e28e937445fb0fdd4ea0786ca7c2f7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 8 Oct 2023 18:03:58 -0400 Subject: [PATCH 012/198] add reserved work space for peft activations and weights --- include/flexflow/config.h | 8 ++- inference/incr_decoding/incr_decoding.cc | 7 +- src/ops/kernels/lora_linear_kernels.cu | 11 +++- src/ops/lora_linear.cc | 4 +- src/runtime/model.cc | 12 ++++ src/runtime/model.cu | 50 ++++++++++++++ src/runtime/request_manager.cc | 83 +++++++++++++++--------- 7 files changed, 140 insertions(+), 35 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 60d1cb17d2..6fd4b957dc 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -80,10 +80,10 @@ struct FFHandler { void *offload_reserve_space; size_t offload_reserve_space_size; // PEFT related fields - void *peft_activation_reserve_space; - size_t peft_activation_reserve_space_size; MemoryAllocator *peft_activation_allocator; + size_t peft_activation_reserve_space_size; PEFTWeightAllocator *peft_weight_allocator; + size_t peft_weight_reserve_space_size; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; @@ -96,6 +96,7 @@ struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -151,6 +152,9 @@ class FFConfig { bool cpu_offload; size_t offload_reserve_space_size; DataType quantization_type; + // PEFT related fields + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 463bc10151..277d86c9cc 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -257,6 +257,11 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Register PEFT layer + std::map peft_config; + peft_config["lora_mlp_linear_second"] = 4; + PEFTModelID peft_model_id = model.register_peft_model(peft_config); + int total_num_requests = 0; { using json = nlohmann::json; @@ -274,7 +279,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); } GenerationResult result = - model.generate(prompts, 128 /*max_sequence_length*/); + model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } // Execution fence diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index ab1ae1b49d..eab98a24e7 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -183,6 +183,7 @@ void 
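// A minimal CPU sketch (an illustrative assumption, not the kernel below) of
// the math the two cuBLAS GEMMs in inference_kernel appear to implement:
// low_rank = A * x with A = weight_first (rank x in_dim), then y += B * low_rank
// with B = weight_second (out_dim x rank), i.e. the LoRA update accumulated in
// place into the dense layer's output. Any LoRA scaling factor is omitted.
#include <vector>
void lora_forward_reference(std::vector<float> const &x,              // in_dim
                            std::vector<std::vector<float>> const &A, // rank x in_dim
                            std::vector<std::vector<float>> const &B, // out_dim x rank
                            std::vector<float> &y) {                  // out_dim, updated in place
  size_t const rank = A.size();
  std::vector<float> low_rank(rank, 0.0f);
  for (size_t r = 0; r < rank; r++) {
    for (size_t k = 0; k < x.size(); k++) {
      low_rank[r] += A[r][k] * x[k]; // first GEMM: project the input down to the LoRA rank
    }
  }
  for (size_t o = 0; o < y.size(); o++) {
    for (size_t r = 0; r < rank; r++) {
      y[o] += B[o][r] * low_rank[r]; // second GEMM: project back up and accumulate (beta = 1)
    }
  }
}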
inference_kernel(LoraLinearMeta *m, LoraLinearWeight weight = m->model_weights[bc->requestsInfo[i].peft_model_id]; int rank = weight.rank; + void *intermediate_result_ptr = nullptr; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( @@ -196,6 +197,12 @@ void inference_kernel(LoraLinearMeta *m, num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; } // buffer = weight_first * input checkCUDA(cublasGemmEx(m->handle.blas, @@ -212,7 +219,7 @@ void inference_kernel(LoraLinearMeta *m, input_type, in_dim, &beta, - m->low_rank_activation, + intermediate_result_ptr, lr_actv_type, rank, compute_type, @@ -230,7 +237,7 @@ void inference_kernel(LoraLinearMeta *m, weight.w1_ptr, weight_type, rank, - m->low_rank_activation, + intermediate_result_ptr, lr_actv_type, rank, &alpha, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 4c92d6cb6c..17ab2d659b 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -261,8 +261,10 @@ void LoraLinear::register_model_task(Task const *task, int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); - assert(dt == m->output_type[1]); + assert(dt == m->output_type[0]); assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); assert(m->model_weights.find(info->model_id) == m->model_weights.end()); LoraLinearWeight weight; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 91361e0cc7..e74e5e11aa 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1508,6 +1508,9 @@ FFRuntime::FFRuntime(FFConfig &config) { info.workSpaceSize = config.workSpaceSize; info.offload_reserve_space_size = config.cpu_offload ? 
config.offload_reserve_space_size : 0; + info.peft_activation_reserve_space_size = + config.peft_activation_reserve_space_size; + info.peft_weight_reserve_space_size = config.peft_weight_reserve_space_size; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -3991,6 +3994,11 @@ struct DefaultConfig { const static bool searchOverlapBackwardUpdate = false; const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB + // PEFT related fields + const static size_t peftActivationReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB + const static size_t peftWeightReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; @@ -4025,6 +4033,10 @@ FFConfig::FFConfig() { computationMode = COMP_MODE_TRAINING; cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + // PEFT related fields + peft_activation_reserve_space_size = + DefaultConfig::peftActivationReserveSpaceSize; + peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 17401a0f14..0c69c9a600 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,8 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/memory_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { // declare Legion names @@ -152,6 +154,54 @@ FFHandler handle.offload_reserve_space = nullptr; } + if (info->peft_activation_reserve_space_size > 0) { + // allocate memory for peft activation reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_activation_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); + handle.peft_activation_allocator->register_reserved_work_space( + ptr, info->peft_activation_reserve_space_size); + } + + if (info->peft_weight_reserve_space_size > 0) { + // allocate memory for peft weight reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_weight_allocator = + new PEFTWeightAllocator(ptr, 
info->peft_weight_reserve_space_size); + } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1616054148..05eb3bb554 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,7 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/fused.h" #include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" @@ -1817,42 +1818,66 @@ GenerationResult FFModel::generate(std::vector &prompts, } } +std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { + for (size_t i = 0; i < model->layers.size(); i++) { + if (model->layers[i]->layer_guid == guid) { + std::string layer_name(model->layers[i]->name); + return layer_name; + } + } + assert(false); + return "invalid_layer_name"; +} + PEFTModelID FFModel::register_peft_model(std::map configs) { PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); + std::vector peft_operators; for (size_t op = 0; op < operators.size(); op++) { if (operators[op]->op_type == OP_LORA_LINEAR) { - std::string opname(operators[op]->name); - // Remove the guid and the ``_'' char from opname: guid has 7 digits - // and ``_'' occupies 1 char - opname.erase(opname.length() - 8); - assert(configs.find(opname) != configs.end()); - int rank = configs[opname]; - LoraLinear *lora = static_cast(operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + peft_operators.push_back(operators[op]); + } else if (operators[op]->op_type == OP_FUSED) { + FusedOp *fused = static_cast(operators[op]); + for (size_t op2 = 0; op2 < fused->numOperators; op2++) { + if (fused->operators[op2]->op_type == OP_LORA_LINEAR) { + peft_operators.push_back(fused->operators[op2]); + } + } + } + } + for (size_t op = 0; op < peft_operators.size(); op++) { + std::string layer_name = + find_layer_name_from_guid(this, peft_operators[op]->layer_guid); + switch (peft_operators[op]->op_type) { + case OP_LORA_LINEAR: { + // Remove the guid and the ``_'' char from opname: guid has 7 digits + // and ``_'' occupies 1 char + layer_name = layer_name.erase(layer_name.length() - 8); + assert(configs.find(layer_name) != configs.end()); + int rank = configs[layer_name]; + LoraLinear *lora = static_cast(peft_operators[op]); + // Currently assume only a single data pipeline + assert(config.data_parallelism_degree == 1); + std::vector inputs(lora->numInputs); + std::vector outputs(lora->numOutputs); + + for (int i = 0; i < lora->numInputs; i++) { + assert(im->tensor_buffer.find(lora->inputs[i]) != + im->tensor_buffer.end()); + assert(lora->inputs[i] != nullptr); + assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->inputs[i]].size() == 1); + inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; + assert(inputs[i]->parallel_is != 
IndexSpace::NO_SPACE); + } + assert(lora->numOutputs == 1); + outputs[0] = inputs[1]; + lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); + break; } - for (int i = 0; i < lora->numOutputs; i++) { - assert(im->tensor_buffer.find(lora->outputs[i]) != - im->tensor_buffer.end()); - assert(lora->outputs[i] != nullptr); - assert(lora->outputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->outputs[i]].size() == 1); - outputs[i] = im->tensor_buffer[lora->outputs[i]][0]; - assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + default: { + assert(false && "Unsupported PEFT Operator type"); } - lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); } } return peft_model_id; From e6f671d076a0ae08709081ef1f4d8f1b51802c83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 01:41:17 -0400 Subject: [PATCH 013/198] fix merge conflicts, implement layernorm peft_bwd --- include/flexflow/fftype.h | 2 + include/flexflow/model.h | 1 + include/flexflow/ops/layer_norm.h | 28 +++ src/ops/fused.cu | 10 +- src/ops/layer_norm.cc | 180 +++++++++++++--- src/ops/layer_norm.cpp | 62 ++++-- src/ops/layer_norm.cu | 274 ++++++++++++++++-------- src/ops/linear.cc | 2 +- src/ops/lora_linear.cc | 8 +- src/runtime/batch_config.cc | 9 +- src/runtime/beam_search_batch_config.cc | 3 + src/runtime/fftype.cc | 9 + src/runtime/model.cc | 15 ++ src/runtime/tree_verify_batch_config.cc | 3 + 14 files changed, 467 insertions(+), 139 deletions(-) diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 2722e00f9c..099b58c82e 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -26,6 +26,8 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend std::ostream &operator<<(std::ostream &os, + PEFTModelID const &peft_model_id); public: size_t id; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 0f33d2c7ea..e2530bcc90 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -106,6 +106,7 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9e48d81190..389b3e718a 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -37,6 +37,11 @@ class LayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -67,6 +72,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -86,6 +95,12 @@ class LayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + 
GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -103,6 +118,17 @@ class LayerNorm : public Op { T const *gamma_ptr, T *gamma_grad_ptr, T *beta_grad_ptr); + template + static void peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + template + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr); public: bool elementwise_affine, use_bias; @@ -124,6 +150,8 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 071078b324..9aa4291453 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1127,14 +1127,20 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back(weight_accessor[fused->op_weight_idx[i + woff]]); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { output_accessors_to_save.push_back(output_accessor[i + ooff]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], shard_id, bc, input_accessors_to_save, weight_accessors_to_save, output_accessors_to_save); + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index bc1358e49c..784e40c598 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" @@ -561,7 +562,7 @@ void LayerNorm::inference_task(Task const *task, assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + LayerNorm::inference_kernel_wrapper(m, bc, in, out, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -645,6 +646,115 @@ void LayerNorm::forward_task(Task const *task, LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } +Legion::FutureMap + LayerNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(2, FID_DATA); + if (elementwise_affine) { + // regions[2](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): gamma +*/ +void LayerNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + // GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + // m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, + // runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + Domain out_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + // Domain in_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + Domain in_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // assert(in_domain == out_grad_domain); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 3)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(gamma_domain.get_volume() == m->effective_num_elements); + } else { + assert(regions.size() == 2); + } + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::peft_bwd_kernel_wrapper(m, + output_grad.get_float_ptr(), + // input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr()); + } else { + LayerNorm::peft_bwd_kernel_wrapper(m, + output_grad.get_half_ptr(), + // input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr()); + } +} + void LayerNorm::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -722,55 +832,75 @@ void LayerNorm::backward_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); 
assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + gamma_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain gamma_grad_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); if (m->use_bias) { Domain beta_grad_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + beta_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); assert(gamma_domain == beta_grad_domain); } - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); } - - LayerNorm::backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel_wrapper(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr()); + } else { + LayerNorm::backward_kernel_wrapper(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr()); + } } bool LayerNorm::measure_operator_cost(Simulator *sim, diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..9beb655d1d 100644 --- a/src/ops/layer_norm.cpp +++ 
b/src/ops/layer_norm.cpp @@ -236,13 +236,13 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, +__global__ void LayerNormBackwardCUDAKernel(int64_t N, T const *dY, T const *X, T const *gamma, - T const *a, - T const *b, - T const *c, + T const *dY_scale, + T const *X_scale, + T const *bias, T *dX) { using T_ACC = T; const int64_t i = blockIdx.x; @@ -250,9 +250,9 @@ __global__ void LayerNormBackwardCUDAKenrel(int64_t N, const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; + dX[index] = static_cast(dY_scale[i]) * + static_cast(dY[index]) * gamma_v + + X_scale[i] * static_cast(X[index]) + bias[i]; } } @@ -532,6 +532,19 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, beta_grad_ptr); } } + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->rstd_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr), + input_grad_ptr); } /*static*/ @@ -545,14 +558,25 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } } template void @@ -563,5 +587,13 @@ template void float const *gamma_ptr, float *gamma_grad_ptr, float *beta_grad_ptr); +template void + LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, + half const *output_grad_ptr, + half const *input_ptr, + half *input_grad_ptr, + half const *gamma_ptr, + half *gamma_grad_ptr, + half *beta_grad_ptr); }; // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..cdf2ed433f 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -115,54 +115,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { return val; } -#ifdef DEADCODE -template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); - } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); - if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); - sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); - } -} - -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; 
- for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - const T_ACC beta_v = - beta == nullptr ? T_ACC(0) : static_cast(beta[j]); - Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; - } -} -#endif - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -290,6 +242,109 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, } } +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // check that at most one dimension after the first is > 1. TODO(goliaro): + // support case where this condition does not hold + int non_unit_dims_encountered = 0; + for (int i = 1; i < input.domain.get_dim(); i++) { + int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; + if (dim_i > 1) { + non_unit_dims_encountered++; + } + } + assert(non_unit_dims_encountered <= 1); + + // allocate space for all peft tokens + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * bc->num_active_peft_tokens() * + in_dim); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests and PEFT forward-only requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { @@ -327,7 +382,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -338,27 +393,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -618,6 +652,59 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, } } +/*static*/ +template +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +template +void LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + LayerNorm::peft_bwd_kernel( + m, output_grad_ptr, input_grad_ptr, gamma_ptr, stream); +} + /*static*/ template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, @@ -629,26 +716,14 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - 
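// A CPU sketch (an assumption for exposition, not the CUDA path itself) of the
// input gradient that peft_bwd_kernel produces for one row of length N, using
// the saved forward statistics mu (mean) and rstd. The fused kernels above
// compute the same quantity through the ds/db sums and the per-row
// coefficients of ComputeGradientFusedParamsCUDAKernel:
//   dx_i = rstd * (g_i - mean_j(g_j) - (x_i - mu) * rstd^2 * mean_j(g_j * (x_j - mu)))
//   with g_j = dy_j * gamma_j (gamma_j = 1 without elementwise affine).
#include <cstddef>
void layer_norm_input_grad_reference(float const *dy,
                                     float const *x,
                                     float const *gamma, // may be nullptr
                                     float mu,
                                     float rstd,
                                     float *dx,
                                     size_t N) {
  float sum_g = 0.0f, sum_gx = 0.0f;
  for (size_t j = 0; j < N; j++) {
    float const g = dy[j] * (gamma ? gamma[j] : 1.0f);
    sum_g += g;
    sum_gx += g * (x[j] - mu);
  }
  float const mean_g = sum_g / N;
  float const mean_gx = sum_gx / N;
  for (size_t i = 0; i < N; i++) {
    float const g = dy[i] * (gamma ? gamma[i] : 1.0f);
    dx[i] = rstd * (g - mean_g - (x[i] - mu) * rstd * rstd * mean_gx);
  }
}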
input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); - } - // }else if(m->output_type[0] == DT_HALF){ - // LayerNorm::backward_kernel(m, - // output_grad_ptr, - // input_ptr, - // input_grad_ptr, - // gamma_ptr, - // gamma_grad_ptr, - // beta_grad_ptr, - // stream); - // } + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); } template void @@ -659,5 +734,24 @@ template void float const *gamma_ptr, float *gamma_grad_ptr, float *beta_grad_ptr); +template void + LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, + half const *output_grad_ptr, + half const *input_ptr, + half *input_grad_ptr, + half const *gamma_ptr, + half *gamma_grad_ptr, + half *beta_grad_ptr); + +template void + LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m, + float const *output_grad_ptr, + float *input_grad_ptr, + float const *gamma_ptr); +template void + LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m, + half const *output_grad_ptr, + half *input_grad_ptr, + half const *gamma_ptr); }; // namespace FlexFlow diff --git a/src/ops/linear.cc b/src/ops/linear.cc index ccc997b8e4..05529a46ec 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -505,7 +505,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->inference_debugging = linear->inference_debugging; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 17ab2d659b..be1015e065 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -469,6 +469,7 @@ bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } /* static */ @@ -478,10 +479,11 @@ Node LoraLinear::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 2); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); LoraLinearParams params; params.layer_guid = layer_guid; @@ -514,6 +516,8 @@ size_t hash::operator()( FlexFlow::LoraLinearParams const ¶ms) const { size_t key = 0; hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); return key; } }; // namespace std diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index badca4010e..32b9146f90 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -105,12 +105,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values os << "Number of active tokens: " << bc.num_active_tokens() << std::endl; - os << "Number of inference tokens: " << bc.num_active_infr_tokens() << std::endl; + os << "Number of inference tokens: " << 
bc.num_active_infr_tokens() + << std::endl; os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; - // PEFT values - os << "PEFT Model ID: " << bc.peft_model_id << std::endl; - os << "PEFT bwd: " << bc.peft_bwd << std::endl; // Per-request info os << "Per-request info:\n"; @@ -122,6 +120,9 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 811ef00ba2..ee89450eca 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -131,6 +131,9 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index e8c3d49a6a..8213726e8a 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,4 +46,13 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { + if (peft_model_id == PEFTModelID::NO_ID) { + os << "NO_ID"; + } else { + os << peft_model_id.id; + } + return os; +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 476485414b..c23eb6c1d9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5396,6 +5396,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, + "layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index cb68ecc5f1..666a76790c 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -52,6 +52,9 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << "PEFT Model ID: " << 
bc.requestsInfo[i].peft_model_id << std::endl; + os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; From 207b127b38970c798aadfcdb2bbbb737f460a2a0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 01:42:26 -0400 Subject: [PATCH 014/198] cleanup --- src/ops/layer_norm.cc | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 784e40c598..6409019dbe 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -706,24 +706,18 @@ void LayerNorm::peft_bwd_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - // GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - // m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, - // runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR gamma; GenericTensorAccessorW gamma_grad, beta_grad; + Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - // Domain in_domain = runtime->get_index_space_domain( - // ctx, task->regions[1].region.get_index_space()); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - // assert(in_domain == out_grad_domain); - // assert(in_domain.get_volume() == - // m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 3)); @@ -743,13 +737,11 @@ void LayerNorm::peft_bwd_task(Task const *task, if (m->output_type[0] == DT_FLOAT) { LayerNorm::peft_bwd_kernel_wrapper(m, output_grad.get_float_ptr(), - // input.get_float_ptr(), input_grad.get_float_ptr(), gamma.get_float_ptr()); } else { LayerNorm::peft_bwd_kernel_wrapper(m, output_grad.get_half_ptr(), - // input.get_half_ptr(), input_grad.get_half_ptr(), gamma.get_half_ptr()); } From 231e244e771c88f0447e69525e413004d07340c0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 04:13:59 -0400 Subject: [PATCH 015/198] rms backward --- include/flexflow/model.h | 2 + .../flexflow/ops/kernels/rms_norm_kernels.h | 7 + include/flexflow/ops/rms_norm.h | 8 + src/ops/kernels/rms_norm_kernels.cu | 175 ++++++++++++++---- src/ops/layer_norm.cc | 4 +- src/ops/rms_norm.cc | 94 +++++++++- src/runtime/model.cc | 31 +++- 7 files changed, 275 insertions(+), 46 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index e2530bcc90..8e0a264e8f 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -166,6 +166,8 @@ enum TaskIDs { RMSNORM_INIT_TASK_ID, RMSNORM_FWD_TASK_ID, RMSNORM_INF_TASK_ID, + RMSNORM_BWD_TASK_ID, + RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, BEAM_TOPK_INIT_TASK_ID, diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 35c5aa69fa..5844880b4b 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -30,6 +30,7 @@ class RMSNormMeta : public 
OpMeta { float eps; void *rms_ptr; void *norm_ptr; + void *c2_ptr; float alpha; float beta; @@ -46,6 +47,12 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 1dc940ebd3..c22caaf69b 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -73,6 +73,14 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..2ec503cfd1 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -115,47 +115,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { return val; } -#ifdef DEADCODE -template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { - __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; - float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; - sum += (static_cast(X[index]) * static_cast(X[index])); - } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } -} - -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; - } -} -#endif - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -261,6 +220,140 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += dY[index] * X[index] * gamma[j]; + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX) { + const int64_t i = blockIdx.x; + 
for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + dX[index] = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr), + input_grad_ptr); + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBackwardCUDAKernel + <<>>(M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 6409019dbe..b5ee66fdba 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -706,14 +706,14 @@ void LayerNorm::peft_bwd_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); - + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[1], 
task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR gamma; GenericTensorAccessorW gamma_grad, beta_grad; - + Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain in_grad_domain = runtime->get_index_space_domain( diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 2a34f83be2..83648b49cf 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -431,6 +431,98 @@ void RMSNorm::inference_task(Task const *task, } } +void RMSNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, output_grad, input, input_grad, weight, weight_grad); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) {} + void RMSNorm::serialize(Legion::Serializer &sez) const { 
sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); @@ -470,8 +562,6 @@ Op *RMSNorm::materialize(FFModel &ff, return new RMSNorm(ff, params, inputs[0], true, this->name); } -void RMSNorm::backward(FFModel const &ff) {} - bool RMSNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c23eb6c1d9..931173e5f3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5363,7 +5363,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // rms norm task + { + TaskVariantRegistrar registrar(RMSNORM_BWD_TASK_ID, "RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RMSNORM_PEFT_BWD_TASK_ID, + "RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // residual rms norm task { TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, "Residual RMS Norm Init"); From 416c322c48fe32e1d889462effc69bb8d50f6272 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 05:03:29 -0400 Subject: [PATCH 016/198] rms peft --- .../flexflow/ops/kernels/rms_norm_kernels.h | 12 ++ include/flexflow/ops/rms_norm.h | 5 + src/ops/kernels/rms_norm_kernels.cu | 165 ++++++++++++++++++ src/ops/rms_norm.cc | 64 ++++++- 4 files changed, 240 insertions(+), 6 deletions(-) diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 5844880b4b..72176f0383 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -39,6 +40,8 @@ class RMSNormMeta : public OpMeta { int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; namespace Kernels { @@ -47,12 +50,21 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); void backward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &input, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight); } // namespace RMSNorm } // 
namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index c22caaf69b..384404d8a0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -34,6 +34,11 @@ class RMSNorm : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 2ec503cfd1..ffb92613a5 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -220,6 +220,103 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // check that at most one dimension after the first is > 1. TODO(goliaro): + // support case where this condition does not hold + int non_unit_dims_encountered = 0; + for (int i = 1; i < input.domain.get_dim(); i++) { + int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; + if (dim_i > 1) { + non_unit_dims_encountered++; + } + } + assert(non_unit_dims_encountered <= 1); + + // allocate space for all peft tokens + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(input.data_type) * bc->num_active_peft_tokens() * + in_dim); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests and PEFT forward-only requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + 
output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { @@ -354,6 +451,74 @@ void backward_kernel_wrapper(RMSNormMeta const *m, } } +template +void peft_bwd_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr)); + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr), + input_grad_ptr); +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 83648b49cf..332472e8e4 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -422,7 +422,7 @@ void RMSNorm::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, weight, output); + inference_kernel_wrapper(m, bc, input, weight, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -511,17 +511,69 @@ void RMSNorm::backward_task(Task const *task, m, output_grad, input, input_grad, weight, weight_grad); } +Legion::FutureMap + RMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + 
set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(2, FID_DATA); + // regions[2](I): weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + /* regions[0](I): output_grad - regions[1](I): input - regions[2](I/O): input_grad - regions[3](I): weight - regions[4](I/O): weight_grad + regions[1](I/O): input_grad + regions[2](I): weight */ void RMSNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, - Runtime *runtime) {} + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, output_grad, input_grad, weight); +} void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); From f72067a4561a769960952c980b9deb1d46684fa6 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 11 Oct 2023 12:15:47 -0400 Subject: [PATCH 017/198] add LoraLinearConfig --- include/flexflow/ffconst.h | 9 +++- include/flexflow/model.h | 5 +- include/flexflow/ops/lora_linear.h | 3 +- include/flexflow/ops/lora_linear_params.h | 17 +++++++ inference/file_loader.cc | 2 +- inference/incr_decoding/incr_decoding.cc | 6 +-- inference/models/llama.cc | 2 +- src/ops/fused.cu | 3 +- src/ops/lora_linear.cc | 39 +++++++++++----- src/ops/lora_linear_params.cc | 20 ++++++++ src/runtime/ffconst_utils.cc | 6 ++- src/runtime/graph.cc | 3 +- src/runtime/model.cc | 3 +- src/runtime/request_manager.cc | 57 +++++++++++++++++++---- 14 files changed, 141 insertions(+), 34 deletions(-) create mode 100644 src/ops/lora_linear_params.cc diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index efc37ce78d..6fe52e6892 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -46,6 +46,12 @@ enum LossType { LOSS_IDENTITY = 54, }; +enum OptimizerType { + OPTIMIZER_TYPE_NONE = 60, + OPTIMIZER_TYPE_SGD = 61, + OPTIMIZER_TYPE_ADAM = 62, +}; + enum CompMode { COMP_MODE_TRAINING = 70, COMP_MODE_INFERENCE = 71, @@ -173,7 +179,8 @@ enum OperatorType { OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // PEFT Ops - OP_LORA_LINEAR, + OP_LORA_MLP_FIRST, + OP_LORA_MLP_SECOND, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 8e0a264e8f..8d6dd87e91 100644 --- 
a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -813,6 +813,7 @@ class FFModel { // ======================================== void lora_linear(Tensor const input, Tensor const output, + OperatorType _type, char const *name = nullptr); // ======================================== // Inference APIs @@ -821,7 +822,9 @@ class FFModel { int max_seq_length, PEFTModelID peft_model_id = PEFTModelID::NO_ID); - PEFTModelID register_peft_model(std::map config); + PEFTModelID register_peft_model( + LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig, + LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 23dc8ec496..b9aabdd1aa 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -19,6 +19,7 @@ class LoraLinear : public Op { LoraLinear(FFModel &model, LayerID const &layer_guid, + OperatorType type, ParallelTensor const input, ParallelTensor const output, char const *name = nullptr); @@ -42,7 +43,7 @@ class LoraLinear : public Op { std::vector const &batch_inputs, std::vector const &batch_outputs, PEFTModelID const &model_id, - int rank); + LoraLinearConfig const lora_config); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 9eaee3000b..46ee4ac6b7 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -9,9 +9,26 @@ namespace FlexFlow { +class LoraLinearConfig { +public: + static const LoraLinearConfig DefaultConfig; + LoraLinearConfig(); + LoraLinearConfig(int rank, + OptimizerType type = OPTIMIZER_TYPE_SGD, + float learning_rate = 1e-4); + friend bool operator==(LoraLinearConfig const &lhs, + LoraLinearConfig const &rhs); + +public: + int rank; + OptimizerType optimizer_type; + float learning_rate; +}; + class LoraLinearParams { public: LayerID layer_guid; + OperatorType type; bool is_valid(std::pair const &input_shape) const; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index f11df920e3..20c14f8f4f 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -765,7 +765,7 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { continue; } // TODO: currently skip Lora layers - if (l->op_type == OP_LORA_LINEAR) { + if (l->op_type == OP_LORA_MLP_FIRST || l->op_type == OP_LORA_MLP_SECOND) { continue; } switch (weight->data_type) { diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 277d86c9cc..461d71b23a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -258,9 +258,9 @@ void FlexFlow::top_level_task(Task const *task, } // Register PEFT layer - std::map peft_config; - peft_config["lora_mlp_linear_second"] = 4; - PEFTModelID peft_model_id = model.register_peft_model(peft_config); + LoraLinearConfig mlp_second(4 /*rank*/); + PEFTModelID peft_model_id = model.register_peft_model( + LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); int total_num_requests = 0; { diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 2fe5642507..20e1f38ce9 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,7 @@ void LLAMA::create_llama_model(FFModel &ff, 
std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2, "lora_mlp_linear_second"); + ff.lora_linear(multi, w2, OP_LORA_MLP_SECOND); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9aa4291453..f404e305e6 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -702,7 +702,8 @@ __host__ void batch_size); break; } - case OP_LORA_LINEAR: { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index be1015e065..2e356f7531 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -33,11 +33,12 @@ using namespace FlexFlow::Kernels::LoraLinear; void FFModel::lora_linear(Tensor const input, Tensor const output, + OperatorType op_type, char const *name) { assert(input->data_type == output->data_type); Layer *lora = nullptr; lora = new Layer(this, - OP_LORA_LINEAR, + op_type, output->data_type, name, 2 /*inputs*/, @@ -61,29 +62,40 @@ Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - return new LoraLinear( - model, layer->layer_guid, inputs[0], inputs[1], layer->name); + return new LoraLinear(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + layer->name); } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, ParallelTensor const output) - : LoraLinear(model, other.layer_guid, input, output, other.name) {} + : LoraLinear( + model, other.layer_guid, other.op_type, input, output, other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, Input const &inputs, char const *name) - : LoraLinear(model, params.layer_guid, inputs.first, inputs.second, name) {} + : LoraLinear(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + name) {} LoraLinear::LoraLinear(FFModel &model, LayerID const &_layer_guid, + OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, char const *name) : Op(model, - OP_LORA_LINEAR, + _op_type, _output->data_type, name, 2 /*inputs*/, @@ -205,7 +217,7 @@ OpMeta *LoraLinear::init_task(Task const *task, struct LoraLinearRegisterInfo { LoraLinear const *lora; PEFTModelID model_id; - int rank; + LoraLinearConfig lora_config; }; void LoraLinear::register_peft_model( @@ -213,7 +225,7 @@ void LoraLinear::register_peft_model( std::vector const &batch_inputs, std::vector const &batch_outputs, PEFTModelID const &model_id, - int rank) { + LoraLinearConfig const lora_config) { assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); @@ -234,7 +246,7 @@ void LoraLinear::register_peft_model( LoraLinearRegisterInfo info; info.lora = this; info.model_id = model_id; - info.rank = rank; + info.lora_config = lora_config; IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, parallel_is, TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), @@ -255,7 +267,7 @@ void LoraLinear::register_model_task(Task const *task, static_cast(task->args); LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); LoraLinear const *lora = info->lora; - int rank = info->rank; + int rank = info->lora_config.rank; int num_dims = lora->inputs[0]->num_dims; int in_dim = 
lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; @@ -463,13 +475,14 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid; + return lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type; } void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(this->op_type); } /* static */ @@ -480,13 +493,16 @@ Node LoraLinear::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; + OperatorType op_type; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + dez.deserialize(op_type); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); LoraLinearParams params; params.layer_guid = layer_guid; + params.type = op_type; return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -500,6 +516,7 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; + params.type = this->op_type; return params; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc new file mode 100644 index 0000000000..80e7c6d64e --- /dev/null +++ b/src/ops/lora_linear_params.cc @@ -0,0 +1,20 @@ +#include "flexflow/ops/lora_linear_params.h" + +namespace FlexFlow { +const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); + +LoraLinearConfig::LoraLinearConfig() + : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f) {} + +LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) + : rank(_rank), optimizer_type(_type), learning_rate(_lr) {} + +bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { + if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && + lhs.learning_rate == rhs.learning_rate) { + return true; + } + return false; +} + +}; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 47abcacd6a..3ee1ee62df 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -189,8 +189,10 @@ std::string get_operator_type_name(OperatorType type) { case OP_ARGMAX: return "ArgMax"; // PEFT Ops - case OP_LORA_LINEAR: - return "LoraLinear"; + case OP_LORA_MLP_FIRST: + return "Lora MLP First Layer"; + case OP_LORA_MLP_SECOND: + return "Lora MLP Second Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5ca09db84b..b58990d32e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2730,7 +2730,8 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_LORA_LINEAR: { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); break; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 931173e5f3..2bc1f30d07 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3228,7 +3228,8 @@ Op *FFModel::create_operator_from_layer( return op; } // PEFT layers - case OP_LORA_LINEAR: { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { Op *op = 
LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 05eb3bb554..1f311b3b56 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1829,17 +1829,28 @@ std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { return "invalid_layer_name"; } -PEFTModelID FFModel::register_peft_model(std::map configs) { +bool is_peft_operator_type(OperatorType type) { + switch (type) { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: + return true; + default: + return false; + } +} + +PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, + LoraLinearConfig const mlp_second) { PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); std::vector peft_operators; for (size_t op = 0; op < operators.size(); op++) { - if (operators[op]->op_type == OP_LORA_LINEAR) { + if (is_peft_operator_type(operators[op]->op_type)) { peft_operators.push_back(operators[op]); } else if (operators[op]->op_type == OP_FUSED) { FusedOp *fused = static_cast(operators[op]); for (size_t op2 = 0; op2 < fused->numOperators; op2++) { - if (fused->operators[op2]->op_type == OP_LORA_LINEAR) { + if (is_peft_operator_type(fused->operators[op2]->op_type)) { peft_operators.push_back(fused->operators[op2]); } } @@ -1849,12 +1860,37 @@ PEFTModelID FFModel::register_peft_model(std::map configs) { std::string layer_name = find_layer_name_from_guid(this, peft_operators[op]->layer_guid); switch (peft_operators[op]->op_type) { - case OP_LORA_LINEAR: { - // Remove the guid and the ``_'' char from opname: guid has 7 digits - // and ``_'' occupies 1 char - layer_name = layer_name.erase(layer_name.length() - 8); - assert(configs.find(layer_name) != configs.end()); - int rank = configs[layer_name]; + case OP_LORA_MLP_FIRST: { + if (mlp_first == LoraLinearConfig::DefaultConfig) { + // Do nothing for the default configuration + continue; + } + LoraLinear *lora = static_cast(peft_operators[op]); + // Currently assume only a single data pipeline + assert(config.data_parallelism_degree == 1); + std::vector inputs(lora->numInputs); + std::vector outputs(lora->numOutputs); + + for (int i = 0; i < lora->numInputs; i++) { + assert(im->tensor_buffer.find(lora->inputs[i]) != + im->tensor_buffer.end()); + assert(lora->inputs[i] != nullptr); + assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->inputs[i]].size() == 1); + inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + assert(lora->numOutputs == 1); + outputs[0] = inputs[1]; + lora->register_peft_model( + *this, inputs, outputs, peft_model_id, mlp_first); + break; + } + case OP_LORA_MLP_SECOND: { + if (mlp_second == LoraLinearConfig::DefaultConfig) { + // Do nothing for the default configuration + continue; + } LoraLinear *lora = static_cast(peft_operators[op]); // Currently assume only a single data pipeline assert(config.data_parallelism_degree == 1); @@ -1872,7 +1908,8 @@ PEFTModelID FFModel::register_peft_model(std::map configs) { } assert(lora->numOutputs == 1); outputs[0] = inputs[1]; - lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); + lora->register_peft_model( + *this, inputs, outputs, peft_model_id, mlp_second); break; } default: { From 49e5664cade618ba7b93c73466efb5a2974ebc0e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 11 
Oct 2023 13:01:19 -0400 Subject: [PATCH 018/198] add an API for register peft request --- include/flexflow/request_manager.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index da64ac58a2..47627bc9fb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -126,6 +126,10 @@ class RequestManager { RequestGuid register_new_request(std::vector const &prompt, int max_sequence_length, PEFTModelID peft_model_id); + RequestGuid register_new_peft_request( + std::vector> const &dataset, + int max_sequence_length, + PEFTModelID peft_model_id); bool is_request_completed(RequestGuid const &guid); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); From 008ffd9a180d3b82e6a0befdfd8f5d53202e766d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 15 Oct 2023 16:19:39 -0400 Subject: [PATCH 019/198] format --- src/ops/tree_inc_multihead_self_attention.cpp | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 05513ea2cc..e5bec2bc07 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -212,7 +212,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index f63f59eae2..a6c4988ac8 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -211,7 +211,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } From ace7e3ff6f27a286d554a7c94c24d14d6d6525b8 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 00:22:33 -0400 Subject: [PATCH 020/198] . 
--- config/config.linux | 2 +- include/flexflow/model.h | 1 + include/flexflow/ops/fused.h | 9 + .../flexflow/ops/kernels/softmax_kernels.h | 27 +- include/flexflow/ops/layer_norm.h | 10 +- include/flexflow/ops/softmax.h | 9 + include/flexflow/request_manager.h | 1 + inference/models/llama.cc | 3 +- src/ops/fused.cc | 61 + src/ops/fused.cu | 1339 ++++++++++++----- src/ops/kernels/softmax.cu | 188 ++- src/ops/layer_norm.cc | 12 +- src/ops/layer_norm.cu | 33 +- src/ops/softmax.cc | 67 +- src/runtime/inference_manager.cc | 50 + src/runtime/model.cc | 32 +- src/runtime/request_manager.cc | 1 + 17 files changed, 1311 insertions(+), 534 deletions(-) diff --git a/config/config.linux b/config/config.linux index 3686237538..dbf3d3dd01 100755 --- a/config/config.linux +++ b/config/config.linux @@ -13,7 +13,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Release} +BUILD_TYPE=${BUILD_TYPE:-Debug} INFERENCE_TESTS=${INFERENCE_TESTS:-OFF} LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"} diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 8d6dd87e91..faf969efb7 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -182,6 +182,7 @@ enum TaskIDs { TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, + FUSEDOP_PEFT_BWD_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, FUSEDOP_INF_TASK_ID, diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index 87c2201c28..ffafa97915 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -40,6 +40,11 @@ class FusedOp : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -51,6 +56,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..339d8ebc53 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -28,16 +28,24 @@ class SoftmaxMeta : public OpMeta { namespace Kernels { namespace Softmax { -template + void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr); -template + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad, size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + namespace Internal { template void forward_kernel(SoftmaxMeta const *m, @@ -50,6 +58,13 @@ void backward_kernel(DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); + +template +void 
inference_kernel(SoftmaxMeta const *m, + DT const *input_ptr, + DT *output_ptr, + ffStream_t stream); + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 389b3e718a..d5ab51bbf8 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -118,17 +118,17 @@ class LayerNorm : public Op { T const *gamma_ptr, T *gamma_grad_ptr, T *beta_grad_ptr); + + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorW const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma); template static void peft_bwd_kernel(LayerNormMeta const *m, T const *output_grad_ptr, T *input_grad_ptr, T const *gamma_ptr, ffStream_t stream); - template - static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T *input_grad_ptr, - T const *gamma_ptr); public: bool elementwise_affine, use_bias; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 6fd1a434d4..5e94c5626c 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -32,6 +32,11 @@ class Softmax : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { @@ -57,6 +62,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 47627bc9fb..5aab9781c8 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -37,6 +37,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); + void peft_bwd(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, ParallelTensor const input); void load_positions(BatchConfigFuture const &bc, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 20e1f38ce9..72641161d1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -257,7 +257,8 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); - output = ff.argmax(dense, /*beam_Search*/ false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } } diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 70650aef0d..8964f0063d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -531,6 +531,67 @@ FutureMap FusedOp::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } +FutureMap FusedOp::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView 
const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + void FusedOp::backward(FFModel const &ff) { // Set iter_config iter_config = ff.iter_config; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f404e305e6..64fe331400 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -78,13 +78,21 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -174,10 +182,11 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -192,21 +201,6 @@ __host__ void 
FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -222,16 +216,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -242,25 +226,49 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case 
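// A minimal sketch (outside the diff above) of what the LoRA branch computes:
// the second input aliases the dense layer's output buffer, and the kernel
// accumulates the low-rank update into it, y += (alpha / r) * B * (A * x).
// A, B, r, alpha, and the alpha/r scaling follow the usual LoRA convention and
// are assumptions here, not taken from the FlexFlow kernel itself.
static void lora_inplace_sketch(float *y,        // dense output, updated in place
                                float const *x,  // layer input, length in_dim
                                float const *A,  // r x in_dim
                                float const *B,  // out_dim x r
                                int in_dim, int out_dim, int r, float alpha) {
  float const scale = alpha / r;
  for (int j = 0; j < r; j++) {
    float aj = 0.0f;
    for (int k = 0; k < in_dim; k++) {
      aj += A[j * in_dim + k] * x[k]; // (A x)_j
    }
    for (int o = 0; o < out_dim; o++) {
      y[o] += scale * B[o * r + j] * aj; // accumulate into the existing output
    }
  }
}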
OP_BATCHMATMUL: { @@ -388,88 +396,126 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta const *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] 
== 2); + biases = my_weight_accessor[1]; } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // TreeVerifyBatchConfig const *tree_bc = + // (TreeVerifyBatchConfig *)task->args; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -491,23 +537,119 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && "Operator ResidualRMSNorm does not support " - "the forward() task"); + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if 
(!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + Domain attn_bias_domain = my_weight_accessor[0].domain; + Domain residual_domain = my_input_accessor[1].domain; + int attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + int residual_volume = residual_domain.get_volume(); + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + attn_bias_dim, + residual_volume, + my_input_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + my_input_accessor[1], + my_weight_accessor[0], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -517,6 +659,37 @@ __host__ void FusedOp::forward_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == 
SOURCE_INPUT) { + input_accessors_to_save.push_back(input_accessor[my_off]); + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + input_accessors_to_save.push_back(output_accessor[my_off]); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(output_accessor[i + ooff]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -531,18 +704,17 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // Return if no active tokens - if (bc->num_tokens == 0) { + if (bc->num_active_tokens() == 0) { return; } @@ -553,15 +725,15 @@ __host__ void // Domain input_domain[MAX_NUM_INPUTS]; // Domain weight_domain[MAX_NUM_WEIGHTS]; // Domain output_domain[MAX_NUM_OUTPUTS]; - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { // input_domain[i] = runtime->get_index_space_domain( // ctx, task->regions[i].region.get_index_space()); - input_accessor[i] = - helperGetGenericTensorAccessorRO(fused->input_data_types[i], + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i], task->regions[i], FID_DATA, @@ -586,8 +758,8 @@ __host__ void for (int i = 0; i < fused->numOutputs; i++) { // output_domain[i] = runtime->get_index_space_domain( // ctx, task->regions[i + roff].region.get_index_space()); - output_accessor[i] = - helperGetGenericTensorAccessorWO(fused->output_data_types[i], + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, @@ -609,21 +781,32 @@ __host__ void } int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain 
my_od[MAX_NUM_OUTPUTS]; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; - my_input_accessor[i] = input_accessor[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; - my_input_accessor[i] = output_accessor[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; } else { assert(false); } @@ -639,7 +822,7 @@ __host__ void assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -647,26 +830,31 @@ __host__ void assert(fused->op_num_outputs[op] == 1); ConcatMeta *m = (ConcatMeta *)metas->meta[op]; int num_inputs = fused->op_num_inputs[op]; - Kernels::Concat::forward_kernel_wrapper(m, - my_output_accessor[0], - my_input_accessor, - num_inputs, - m->legion_axis); + // TODO: implement this + assert(false); + // Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); break; } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); assert(my_weight_accessor[0].domain.get_dim() == 2); assert(my_weight_accessor[1].domain.get_dim() == 2); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::forward_kernel(m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); break; } case OP_LINEAR: { @@ -675,10 +863,11 @@ __host__ void Domain kernel_domain = my_weight_accessor[0].domain; int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == 
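// A minimal sketch of the offset bookkeeping used by peft_bwd_task above,
// with hypothetical per-operator counts: the first loop advances
// ioff/woff/ooff past every operator, and the reverse loop then peels each
// operator's slice off the back, so indices [ioff, ioff + op_num_inputs[op])
// always address exactly that operator's inputs.
static void reverse_slice_walk_sketch() {
  int const num_ops = 3;
  int const op_num_inputs[num_ops] = {1, 2, 1}; // hypothetical counts
  int ioff = 0;
  for (int op = 0; op < num_ops; op++) {
    ioff += op_num_inputs[op]; // after this loop, ioff == 4 (total inputs)
  }
  for (int op = num_ops - 1; op >= 0; op--) {
    ioff -= op_num_inputs[op];
    // op == 2 -> ioff == 3, op == 1 -> ioff == 1, op == 0 -> ioff == 0
  }
}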
+ in_dim * batch_size); void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { @@ -689,48 +878,50 @@ __host__ void } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_infr_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); break; } case OP_LORA_MLP_FIRST: case OP_LORA_MLP_SECOND: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - Domain input_domain = my_input_accessor[0].domain; - Domain output_domain = my_output_accessor[0].domain; + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->output_type[0] == my_output_accessor[0].data_type); + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA - assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); - Kernels::LoraLinear::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_BATCHMATMUL: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; @@ -746,20 +937,22 @@ __host__ void assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); batch *= dim_size; } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - 
Kernels::BatchMatmul::forward_kernel_wrapper( - meta, - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - (float const *)nullptr, - m, - n, - k, - batch, - meta->a_seq_length_dim, - meta->b_seq_length_dim, - fused->iter_config.seq_length); + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); break; } case OP_EW_ADD: @@ -771,78 +964,20 @@ __host__ void assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); break; } case OP_EMBEDDING: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - if (m->aggr == AGGR_MODE_NONE) { - // assert(kernel_domain.get_dim() == 2); - assert(my_input_accessor[0].domain.get_dim() + 1 == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i + 1]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i + 1]); - } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } else { - assert(my_input_accessor[0].domain.get_dim() == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i]); - } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } else { - assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); - in_dim = my_input_accessor[0].domain.hi()[0] - - my_input_accessor[0].domain.lo()[0] + 1; - out_dim = 
my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } - - assert(my_input_accessor[0].data_type == DT_INT32 || - my_input_accessor[0].data_type == DT_INT64); - Kernels::Embedding::forward_kernel_wrapper(m, - my_input_accessor[0], - my_output_accessor[0], - my_weight_accessor[0], - in_dim, - out_dim, - effective_batch_size); + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding break; } case OP_GELU: @@ -854,23 +989,26 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } break; } case OP_RMS_NORM: { @@ -878,23 +1016,26 @@ __host__ void assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { + // TODO: implement me + assert(false); assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + // ResidualRMSNormMeta const *m = (ResidualRMSNormMeta + // *)metas->meta[op]; + // Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_weight_accessor[0], + // my_output_accessor[0], + // my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -909,66 +1050,20 @@ __host__ void assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } - 
IncMultiHeadSelfAttention::inference_kernel_wrapper( + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], + my_input_grad_accessor[0], my_weight_accessor[0], - my_output_accessor[0], + my_output_grad_accessor[0], biases); break; } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + // TODO: implement me + assert(false); break; } case OP_LAYERNORM: { @@ -985,8 +1080,8 @@ __host__ void beta = my_weight_accessor[1]; } } - LayerNorm::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], gamma); break; } case OP_RESIDUAL_LAYERNORM: { @@ -1009,7 +1104,7 @@ __host__ void } GenericTensorAccessorR residual2; if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; + residual2 = my_input_grad_accessor[2]; } GenericTensorAccessorR gamma, beta; if (m->elementwise_affine) { @@ -1018,14 +1113,16 @@ __host__ void beta = my_weight_accessor[1]; } } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + // TODO: implment me + assert(false); + // ResidualLayerNorm::inference_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // residual2, + // my_output_accessor[0], + // my_output_accessor[1], + // gamma, + // beta); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { @@ -1050,59 +1147,55 @@ __host__ void } } Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; + Domain residual_domain = my_input_grad_accessor[1].domain; int attn_bias_dim = attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; int residual_volume = residual_domain.get_volume(); - 
AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + // TODO: implement me + assert(false); + // AddBiasResidualLayerNorm::inference_kernel_wrapper( + // m, + // attn_bias_dim, + // residual_volume, + // my_input_accessor[0], + // my_output_accessor[0], + // my_output_accessor[1], + // my_input_accessor[1], + // my_weight_accessor[0], + // gamma, + // beta); break; } case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); + // SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta + // *)metas->meta[op]; + // TODO: implement me + assert(false); + // SigmoidSiluMulti::inference_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); break; } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_ALLREDUCE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } default: { @@ -1112,36 +1205,458 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - 
input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_accessor[i] = + helperGetGenericTensorAccessorWO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_accessor[i] = input_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = 
output_domain[my_off]; + my_input_accessor[i] = output_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_accessor[i] = output_accessor[i + ooff]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::forward_kernel_wrapper(m, + my_output_accessor[0], + my_input_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::forward_kernel(m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + float const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].get_float_ptr(); + } + } else { + assert(fused->op_num_weights[op] == 1); + } + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + 
my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::forward_kernel_wrapper( + meta, + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + (float const *)nullptr, + m, + n, + k, + batch, + meta->a_seq_length_dim, + meta->b_seq_length_dim, + fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = 
my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + 
assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(false && "Operator SigmoidSiluMulti does not support " + "the forward() task"); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..69f98d5e5a 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -40,10 +40,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -52,7 +51,15 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -99,21 +106,78 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + 
GenericTensorAccessorW const &output) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+  int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1;
+  int num_tokens = bc->num_active_tokens();
+  if (m->output_type[0] == DT_FLOAT) {
+    Internal::inference_kernel(m,
+                               input.get_float_ptr(),
+                               output.get_float_ptr(),
+                               num_tokens,
+                               num_classes,
+                               stream);
+  } else if (m->output_type[0] == DT_HALF) {
+    Internal::inference_kernel(m,
+                               input.get_half_ptr(),
+                               output.get_half_ptr(),
+                               num_tokens,
+                               num_classes,
+                               stream);
+  } else {
+    assert(false && "Unsupported data type");
+  }
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    // print_tensor(acc_input.ptr, acc_input.rect.volume(),
+    // "[Softmax:forward:input]"); print_tensor(acc_output.ptr,
+    // acc_output.rect.volume(), "[Softmax:forward:output]");
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    log_measure.debug(
+        "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed);
+  }
+}
+
+void peft_bwd_kernel_wrapper(SoftmaxMeta const *m,
+                             BatchConfig const *bc,
+                             GenericTensorAccessorW const &input_grad,
+                             GenericTensorAccessorR const &output_grad) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+  int num_classes =
+      output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1;
+  if (m->output_type[0] == DT_FLOAT) {
+    Internal::peft_bwd_kernel(m,
+                              bc,
+                              input_grad.get_float_ptr(),
+                              output_grad.get_float_ptr(),
+                              num_classes,
+                              stream);
+  } else if (m->output_type[0] == DT_HALF) {
+    Internal::peft_bwd_kernel(m,
+                              bc,
+                              input_grad.get_half_ptr(),
+                              output_grad.get_half_ptr(),
+                              num_classes,
+                              stream);
+  } else {
+    assert(false && "Unsupported data type");
+  }
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    log_measure.debug(
+        "%s [Softmax] PEFT backward time = %.2fms\n", m->op_name, elapsed);
+  }
+}
+
 namespace Internal {
 template <typename DT>
 void forward_kernel(SoftmaxMeta const *m,
@@ -146,6 +210,92 @@ void backward_kernel(DT *input_grad_ptr,
                                  stream));
 }
 
+template <typename DT>
+void inference_kernel(SoftmaxMeta const *m,
+                      DT const *input_ptr,
+                      DT *output_ptr,
+                      int num_tokens,
+                      int num_classes,
+                      cudaStream_t stream) {
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+
+  float alpha = 1.0f, beta = 0.0f;
+  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
+  checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
+                                        CUDNN_TENSOR_NCHW,
+                                        cudnn_data_type,
+                                        num_tokens,
+                                        num_classes,
+                                        1,
+                                        1));
+  checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
+                                 CUDNN_SOFTMAX_ACCURATE,
+                                 CUDNN_SOFTMAX_MODE_CHANNEL,
+                                 &alpha,
+                                 m->outputTensor,
+                                 input_ptr,
+                                 &beta,
+                                 m->outputTensor,
+                                 output_ptr));
+}
+
+template <typename DT>
+__global__ void sparse_categorical_crossentropy_loss_peft_backward(
+    DT *input_grad,
+    DT const *output_grad,
+    BatchConfig::TokenId const *token_ids,
+    int num_tokens,
+    int num_classes) {
+  CUDA_KERNEL_LOOP(i, num_tokens * num_classes) {
+    input_grad[i] = output_grad[i];
+    if (i % num_classes == token_ids[i / num_classes]) {
+      input_grad[i] -= 1.0f;
+    }
+  }
+}
+
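+// The kernel above applies the usual softmax + sparse categorical
+// cross-entropy gradient: with softmax probabilities p and ground-truth token
+// id y, dL/dz_c = p_c - 1[c == y]. output_grad is therefore expected to hold
+// the softmax output; the backward pass copies it and subtracts 1 at the
+// ground-truth position. A host-side sketch of the same update, reusing the
+// kernel's parameter names:
+//
+//   for (int t = 0; t < num_tokens; t++) {
+//     for (int c = 0; c < num_classes; c++) {
+//       int i = t * num_classes + c;
+//       input_grad[i] = output_grad[i] - (c == token_ids[t] ? 1.0f : 0.0f);
+//     }
+//   }
+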
+template <typename DT>
+void peft_bwd_kernel(SoftmaxMeta const *m,
+                     BatchConfig const *bc,
+                     DT *input_grad_ptr,
+                     DT const *output_grad_ptr,
+                     int num_classes,
+                     cudaStream_t stream) {
+  BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS];
+  int tokens_previous_requests = 0;
+  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
+    if (bc->request_completed[i]) {
+      continue;
+    }
+    // Skip non-PEFT requests
+    if (!bc->requestsInfo[i].peft_bwd) {
+      tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch;
+      continue;
+    }
+    int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch;
+    for (int j = 0; j < num_bwd_tokens; j++) {
+      token_ids[j] = bc->tokensInfo[j + tokens_previous_requests].token_id;
+    }
+    checkCUDA(cudaMemcpyAsync(m->handle.workSpace,
+                              token_ids,
+                              sizeof(BatchConfig::TokenId) * num_bwd_tokens,
+                              cudaMemcpyHostToDevice,
+                              stream));
+    sparse_categorical_crossentropy_loss_peft_backward<DT><<<
+        GET_BLOCKS(num_bwd_tokens * num_classes),
+        CUDA_NUM_THREADS,
+        0,
+        stream>>>(
+        input_grad_ptr + tokens_previous_requests * num_classes,
+        output_grad_ptr + tokens_previous_requests * num_classes,
+        static_cast<BatchConfig::TokenId const *>(m->handle.workSpace),
+        num_bwd_tokens,
+        num_classes);
+
+    tokens_previous_requests += num_bwd_tokens;
+  }
+  assert(tokens_previous_requests == bc->num_active_tokens());
+}
+
 } // namespace Internal
 } // namespace Softmax
 } // namespace Kernels
diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc
index b5ee66fdba..b0d196a7c4 100644
--- a/src/ops/layer_norm.cc
+++ b/src/ops/layer_norm.cc
@@ -734,17 +734,7 @@ void LayerNorm::peft_bwd_task(Task const *task,
   } else {
     assert(regions.size() == 2);
   }
-  if (m->output_type[0] == DT_FLOAT) {
-    LayerNorm::peft_bwd_kernel_wrapper(m,
-                                       output_grad.get_float_ptr(),
-                                       input_grad.get_float_ptr(),
-                                       gamma.get_float_ptr());
-  } else {
-    LayerNorm::peft_bwd_kernel_wrapper(m,
-                                       output_grad.get_half_ptr(),
-                                       input_grad.get_half_ptr(),
-                                       gamma.get_half_ptr());
-  }
+  LayerNorm::peft_bwd_kernel_wrapper(m, output_grad, input_grad, gamma);
 }
 
 void LayerNorm::backward(FFModel const &ff) {
diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu
index cdf2ed433f..3d828362dd 100644
--- a/src/ops/layer_norm.cu
+++ b/src/ops/layer_norm.cu
@@ -694,15 +694,26 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m,
 }
 
 /*static*/
-template <typename T>
-void LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m,
-                                        T const *output_grad_ptr,
-                                        T *input_grad_ptr,
-                                        T const *gamma_ptr) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  LayerNorm::peft_bwd_kernel(
-      m, output_grad_ptr, input_grad_ptr, gamma_ptr, stream);
+void LayerNorm::peft_bwd_kernel_wrapper(
+    LayerNormMeta const *m,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &gamma) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  if (m->output_type[0] == DT_FLOAT) {
+    LayerNorm::peft_bwd_kernel(m,
+                               output_grad.get_float_ptr(),
+                               input_grad.get_float_ptr(),
+                               gamma.get_float_ptr(),
+                               stream);
+  } else {
+    assert(m->output_type[0] == DT_HALF);
+    LayerNorm::peft_bwd_kernel(m,
+                               output_grad.get_half_ptr(),
+                               input_grad.get_half_ptr(),
+                               gamma.get_half_ptr(),
+                               stream);
+  }
 }
 
 /*static*/
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index ba0a1288d6..8d4a1f64b4 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -315,13 +315,7 @@ void Softmax::forward_task(Task const
*task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); - } + forward_kernel_wrapper(m, input, output); } void Softmax::backward(FFModel const &ff) { @@ -359,52 +353,11 @@ void Softmax::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } -} - -/* - regions[0](I/O): input_grad - regions[1](I): output_grad -*/ -// Note that the backward task of softmax is actually a no op (i.e., input_grad -// = output_grad) since the upstream cross_entropy_loss function computes -// performs softmax_cross_entropy_loss to avoid intermediate zeros -template -void Softmax::backward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - TensorAccessorR acc_output_grad( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // make sure the image indices match! 
-  assert(acc_input_grad.rect == acc_output_grad.rect);
-
-  backward_kernel_wrapper(
-      m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume());
+  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
+      m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
+      m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  backward_kernel_wrapper(m, input_grad, output_grad);
 }
 
 void Softmax::inference_task(Task const *task,
@@ -425,13 +378,7 @@ void Softmax::inference_task(Task const *task,
       m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
       m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  if (m->output_type == DT_HALF) {
-    forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr());
-  } else if (m->output_type == DT_FLOAT) {
-    forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr());
-  } else {
-    assert(false && "Unsupported data type");
-  }
+  inference_kernel_wrapper(m, input, output);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 199b94c72c..5d81fa4664 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -369,6 +369,56 @@ FutureMap InferenceManager::inference(FFModel *model,
   return fm;
 };
 
+void InferenceManager::peft_bwd(FFModel *model,
+                                int index,
+                                BatchConfigFuture const &bc) {
+  int batch_index = index % model->config.data_parallelism_degree;
+  FutureMap fm;
+  bool found_input_operator = false;
+  int last_op = model->operators.size() - 1;
+  // Assert that the last operator must be argmax or sampling
+  assert(model->operators[last_op]->op_type == OP_ARGMAX ||
+         model->operators[last_op]->op_type == OP_SAMPLING);
+  last_op -= 1;
+  while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) {
+    last_op -= 1;
+  }
+  // Assert that the previous operator must be softmax
+  assert(model->operators[last_op]->op_type == OP_SOFTMAX ||
+         model->operators[last_op]->op_type == OP_FUSED);
+  if (model->operators[last_op]->op_type == OP_FUSED) {
+    FusedOp *fused_op = static_cast<FusedOp *>(model->operators[last_op]);
+    assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX);
+  }
+  for (int o = last_op; o >= 0; o--) {
+    Op *op = model->operators[o];
+    if (op->op_type == OP_WEIGHT) {
+      continue;
+    }
+    std::vector<ParallelTensor> inputs(op->numInputs);
+    std::vector<ParallelTensor> outputs(op->numOutputs);
+    for (int i = 0; i < op->numInputs; i++) {
+      assert(op->inputs[i] != nullptr);
+      assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE);
+      assert(tensor_buffer[op->inputs[i]].size() > batch_index);
+      inputs[i] = tensor_buffer[op->inputs[i]][batch_index];
+      assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE);
+    }
+    for (int i = 0; i < op->numOutputs; i++) {
+      assert(op->outputs[i] != nullptr);
+      assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE);
+      if (op->op_type == OP_INPUT &&
+          tensor_buffer[op->outputs[i]].size() == 0) {
+        continue;
+      }
+      assert(tensor_buffer[op->outputs[i]].size() > batch_index);
+      outputs[i] = tensor_buffer[op->outputs[i]][batch_index];
+      assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE);
+    }
+    op->peft_bwd(*model, bc, inputs, outputs);
+  }
+};
+
 void InferenceManager::load_input_tokens_from_batch_config(
BatchConfigFuture const &bc, ParallelTensor const input) { Context ctx = ff_config.lg_ctx; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2bc1f30d07..69a7f3786f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6385,31 +6385,47 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Forward Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + TaskVariantRegistrar registrar(FUSEDOP_PEFT_BWD_TASK_ID, + "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + + { + TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 16f7a44e07..024c8f11ce 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1961,6 +1961,7 @@ GenerationResult BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); + im->peft_bwd(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); From 6bbb81e3f5aa0e4e01bea75c9090d40f890230b3 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 11:23:56 -0400 Subject: [PATCH 021/198] variable renaming --- include/flexflow/batch_config.h | 2 +- include/flexflow/request_manager.h | 2 +- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 2 +- src/runtime/batch_config.cc | 6 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 66 +++++++++++-------- src/runtime/tree_verify_batch_config.cc | 4 +- 10 files changed, 50 insertions(+), 42 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 108bc8d172..25bc206bf9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { int num_tokens; struct PerRequestInfo { - int token_start_offset; + int first_token_depth_in_request; int num_tokens_in_batch; int max_sequence_length; RequestGuid 
request_guid; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3081aaa1c2..baf6844801 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -154,7 +154,7 @@ class RequestManager { std::vector> traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, - int token_start_offset); + int first_token_depth_in_request); // remove guid after put the cached tree in request std::vector> merge_dfs_trees( diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 562898a220..37cc986f5e 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -532,7 +532,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 00d45a9cfa..6ec077c328 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -531,7 +531,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 173d4a5b1d..1d81ae0c11 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -231,7 +231,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 00eec96824..8b89acf3b7 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,7 +248,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; if (num_new_tokens <= 0) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 72572c4e06..4781f09cab 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -27,7 +27,7 @@ using Legion::Memory; BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - requestsInfo[i].token_start_offset = 0; + requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].num_tokens_in_batch = 0; 
request_completed[i] = true; } @@ -104,8 +104,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 811ef00ba2..f785dc5b74 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -126,8 +126,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b5688c07e6..1c5a6ae5da 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -367,7 +367,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens - @@ -382,7 +382,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); @@ -397,8 +397,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; + int processed_tokens = + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); @@ -464,12 +465,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; 
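+      // first_token_depth_in_request (formerly token_start_offset) counts how
+      // many tokens of this request were already processed in earlier batches,
+      // i.e. the depth within the request at which this batch's first token
+      // starts.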
new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - if (new_bc.requestsInfo[i].token_start_offset + 1 == + if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; @@ -478,10 +479,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].first_token_depth_in_request); } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); @@ -685,7 +686,7 @@ BeamSearchBatchConfig new_bc.request_running[i] = true; // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = + new_bc.requestsInfo[i].first_token_depth_in_request = verified_tokens.front().second; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; @@ -694,9 +695,10 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); + int new_max_depth = + new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].first_token_depth_in_request - + verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -742,7 +744,8 @@ BeamSearchBatchConfig assert(request.ssm_cache_size == request.initial_len); // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.ssm_cache_size; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.ssm_cache_size; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -776,7 +779,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, @@ -806,7 +809,7 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); @@ -922,7 +925,7 @@ BeamSearchBatchConfig // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = 
old_bc.requestsInfo[i].token_start_offset + + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; // assert(processed_tokens < request.tokens.size()); @@ -937,7 +940,8 @@ BeamSearchBatchConfig // // old_bc.beamRequestsInfo[i].max_depth); // // // new_bc.request_completed[i] = true; // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].token_start_offset = processed_tokens; + // // new_bc.requestsInfo[i].first_token_depth_in_request = + // processed_tokens; // // new_bc.requestsInfo[i].request_guid = // // old_bc.requestsInfo[i].request_guid; // // new_bc.requestsInfo[i].max_sequence_length = @@ -953,7 +957,7 @@ BeamSearchBatchConfig log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -986,7 +990,8 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { + if (new_bc.requestsInfo[i].first_token_depth_in_request >= + request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { new_bc.requestsInfo[i].num_tokens_in_batch = 1; @@ -1006,7 +1011,7 @@ BeamSearchBatchConfig std::min(get_max_tokens_per_batch() - new_bc.num_tokens - BatchConfig::max_requests_per_batch() + i, (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; if (verbose) { std::cout << "[ Beam Spec] " << request.guid << std::endl; @@ -1027,7 +1032,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.sub_requests[i]; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1151,7 +1156,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = + new_bc.requestsInfo[i].first_token_depth_in_request = dfs_tree_inputs.front().second; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; @@ -1204,7 +1209,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.tokens.size() - 1; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { @@ -1257,7 +1263,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.llm_cache_size; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; 
new_bc.requestsInfo[i].max_sequence_length = @@ -1265,9 +1272,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - max_prompt_load_size, - (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(max_prompt_load_size, + (int)request.initial_len - + new_bc.requestsInfo[i].first_token_depth_in_request); max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; std::cout << "max_prompt_load_size: " << max_prompt_load_size @@ -1673,7 +1681,7 @@ std::vector> std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, - int token_start_offset) { + int first_token_depth_in_request) { if (verbose) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; @@ -1709,7 +1717,7 @@ std::vector> << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { - serializedTree.at(k).second += token_start_offset; + serializedTree.at(k).second += first_token_depth_in_request; if (verbose) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << "\n"; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index cb68ecc5f1..6dbcaceaa4 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -47,8 +47,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; From 54084c430446a70c520d9240a8443ad905f22e72 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 18:46:42 -0400 Subject: [PATCH 022/198] resolve conflict --- include/flexflow/model.h | 1 + src/ops/kernels/softmax.cu | 3 +- src/ops/layer_norm.cc | 41 +++++------ src/ops/layer_norm.cu | 45 +++++++----- src/ops/softmax.cc | 135 ++++++++++++++++++++++++++---------- src/runtime/batch_config.cc | 5 -- src/runtime/model.cc | 16 +++++ 7 files changed, 162 insertions(+), 84 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index faf969efb7..54a4cb1d37 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -132,6 +132,7 @@ enum TaskIDs { SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, SOFTMAX_INF_TASK_ID, + SOFTMAX_PEFT_BWD_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e31d508c95..f43bdfccbc 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -230,7 +230,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b0d196a7c4..e9f8feae2b 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -866,23 +866,8 @@ void LayerNorm::backward_task(Task const 
*task, } else { assert(regions.size() == 3); } - if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel_wrapper(m, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); - } else { - LayerNorm::backward_kernel_wrapper(m, - output_grad.get_half_ptr(), - input.get_half_ptr(), - input_grad.get_half_ptr(), - gamma.get_half_ptr(), - gamma_grad.get_half_ptr(), - beta_grad.get_half_ptr()); - } + LayerNorm::backward_kernel_wrapper( + m, output_grad, input, input_grad, gamma, gamma_grad, beta_grad); } bool LayerNorm::measure_operator_cost(Simulator *sim, @@ -933,16 +918,24 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (sim->computationMode == COMP_MODE_TRAINING) { float *in_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW in_grad_acc( + inputs[0]->data_type, input_domain, in_grad_ptr); assert(in_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_grad_ptr = NULL; out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR out_grad_acc( + outputs[0]->data_type, output_domain, out_grad_ptr); assert(out_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorW gamma_grad_acc( + outputs[0]->data_type, output_domain, gamma_grad_ptr); + GenericTensorAccessorW beta_grad_acc( + outputs[0]->data_type, output_domain, beta_grad_ptr); out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && @@ -954,13 +947,13 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } backward = [=] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + backward_kernel_wrapper(m, + out_grad_acc, + input1_acc, + in_grad_acc, + gamma_acc, + gamma_grad_acc, + beta_grad_acc); }; } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index a59fa39b78..e242904775 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -718,24 +718,37 @@ void LayerNorm::peft_bwd_kernel_wrapper( } /*static*/ -template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), 
+ stream); + } else { + assert(false && "Unsupported data type"); + } } } // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 9e0f68c906..d0e38c8017 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -233,44 +233,6 @@ OpMeta *Softmax::init_task(Task const *task, return m; } -FutureMap Softmax::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv - << std::endl; */ - IndexLauncher launcher(SOFTMAX_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -358,6 +320,44 @@ void Softmax::backward_task(Task const *task, backward_kernel_wrapper(m, input_grad, output_grad); } +FutureMap Softmax::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Softmax::inference_task(Task const *task, std::vector const ®ions, Context ctx, @@ -385,6 +385,65 @@ void Softmax::inference_task(Task const *task, } } +FutureMap Softmax::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Softmax::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 425b8eeda3..1a6e32e582 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -116,15 +116,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; -<<<<<<< HEAD os << " First token depth in request: " << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " First token offset in batch: " << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; -======= - os << " Token start offset: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; ->>>>>>> 4c06a0907ec694b21a989a51120e846d0f0cfa74 os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 69a7f3786f..e94606718a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5627,6 +5627,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(SOFTMAX_PEFT_BWD_TASK_ID, + "Softmax PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Softmax PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // compute Loss { TaskVariantRegistrar 
registrar(LOSS_BWD_TASK_ID, "Loss Backward"); From a44e33dde3a310ffb493fc603927cb40d1dbbc29 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 23:00:04 -0400 Subject: [PATCH 023/198] add missing functions --- include/flexflow/model.h | 4 +- .../ops/inc_multihead_self_attention.h | 9 + include/flexflow/parallel_ops/allreduce.h | 19 +- src/ops/fused.cu | 6 - src/ops/inc_multihead_self_attention.cc | 124 +++++++++ src/ops/inc_multihead_self_attention.cu | 76 ++++++ src/ops/kernels/softmax.cu | 2 +- src/parallel_ops/allreduce.cc | 250 +++++++++++------- src/parallel_ops/kernels/allreduce_kernels.cu | 51 +++- src/runtime/inference_manager.cc | 3 + src/runtime/model.cc | 60 ++++- 11 files changed, 470 insertions(+), 134 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 54a4cb1d37..ac24e90900 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -177,6 +177,7 @@ enum TaskIDs { INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, @@ -241,9 +242,10 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, - ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_PEFT_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index c220091174..76569de4cb 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -96,6 +96,11 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -109,6 +114,10 @@ class IncMultiHeadSelfAttention : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 045f9b36a0..7e0e4362e2 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -34,12 +34,17 @@ class AllReduce : public ParallelOp { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; 
@@ -47,10 +52,6 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -59,6 +60,14 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 88eefc7e82..692316c6d4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1187,13 +1187,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); } /* diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index c8e7ba72f4..5cf4dbdf7c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -876,6 +876,130 @@ void IncMultiHeadSelfAttention::inference_task( } } +FutureMap IncMultiHeadSelfAttention::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + if (qkv_bias || final_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_tokens == 0) { + return; + } + + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + + assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_grad_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_grad_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + input_grad, + weight, + output_grad, + biases); +} + void IncMultiHeadSelfAttention::backward(FFModel const &ff) { // IncMultiHeadSelfAttention does not support backward assert(false); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 6cc0796c85..d92862ba30 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -446,6 +446,18 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + assert(false); +} + } // namespace IncMultiHeadAttention } // namespace Kernels @@ -842,6 +854,70 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } } +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + 
checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(m->offload); + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); + } +} + IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index f43bdfccbc..9ccce40c58 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -24,7 +24,7 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 027d15c929..62e152b36c 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -143,6 +143,102 @@ void AllReduce::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + 
GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + void AllReduce::init_inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -221,64 +317,84 @@ FutureMap AllReduce::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } -void AllReduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = outputs[0]->parallel_is; - assert(numOutputs == 1); - assert(numInputs == 1); - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, - outputs[0]->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + 
m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); } -void AllReduce::backward(FFModel const &ff) { +FutureMap AllReduce::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, - inputs[0]->parallel_is, - TaskArgument(NULL, 0), + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - inputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[0]->region_grad)); + batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - outputs[0]->region_grad)); + batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); } bool AllReduce::measure_operator_cost(Simulator *sim, @@ -315,62 +431,6 @@ bool AllReduce::append_parallel_op_info( return true; } -/*static*/ -void AllReduce::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); -} - -/*static*/ -void AllReduce::forward_task(Task const 
*task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); -} - -void AllReduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input_grad.data_type == output_grad.data_type); - backward_kernel_wrapper(m, input_grad, output_grad); -} - }; // namespace FlexFlow namespace std { diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a1..5861f05d7a 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -24,21 +24,18 @@ AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -48,18 +45,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, 
m->handle.ncclComm, @@ -69,10 +75,29 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 5d81fa4664..0f71291ded 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -395,6 +395,9 @@ void InferenceManager::peft_bwd(FFModel *model, if (op->op_type == OP_WEIGHT) { continue; } + if (op->op_type == OP_INPUT) { + continue; + } std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); for (int i = 0; i < op->numInputs; i++) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index e94606718a..04a847b023 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6229,6 +6229,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + "IncMultiHeadSelfAttention PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::peft_bwd_task>( + registrar, "IncMultiHeadSelfAttention PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( @@ -6651,48 +6669,64 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, - "AllReduce Inference"); + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Inference Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = 
false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, + "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Backward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_PEFT_BWD_TASK_ID, + "AllReduce PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } + // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, From 4d55b4079dc3612e5f0206f6f0a4161f22230b3d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 18 Oct 2023 00:50:40 -0400 Subject: [PATCH 024/198] remove OpMeta(FFhandler) constructor --- include/flexflow/op_meta.h | 2 +- include/flexflow/ops/aggregate.h | 4 +- include/flexflow/ops/aggregate_spec.h | 4 +- include/flexflow/ops/cache.h | 4 +- include/flexflow/ops/element_unary.h | 4 +- include/flexflow/ops/experts.h | 15 +---- include/flexflow/ops/groupby.h | 4 +- .../ops/kernels/batch_matmul_kernels.h | 4 +- include/flexflow/ops/kernels/cast_kernels.h | 4 +- include/flexflow/ops/kernels/concat_kernels.h | 4 +- .../flexflow/ops/kernels/conv_2d_kernels.h | 4 +- include/flexflow/ops/kernels/flat_kernels.h | 4 +- .../flexflow/ops/kernels/pool_2d_kernels.h | 4 +- .../flexflow/ops/kernels/reshape_kernels.h | 6 +- .../flexflow/ops/kernels/transpose_kernels.h | 4 +- include/flexflow/ops/topk.h | 4 +- include/flexflow/ops/transpose.h | 2 + .../parallel_ops/kernels/combine_kernels.h | 4 +- .../parallel_ops/kernels/partition_kernels.h | 4 +- include/flexflow/simulator.h | 56 +++++++++---------- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/aggregate.cc | 4 +- src/ops/aggregate.cpp | 9 +-- src/ops/aggregate.cu | 7 ++- src/ops/aggregate_spec.cc | 4 +- src/ops/aggregate_spec.cpp | 7 ++- src/ops/aggregate_spec.cu | 7 ++- src/ops/attention.cpp | 2 +- src/ops/attention.cu | 2 +- src/ops/batch_matmul.cc | 4 +- src/ops/batch_norm.cpp | 2 +- src/ops/batch_norm.cu | 2 +- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/cache.cc | 2 +- src/ops/cache.cpp | 2 +- src/ops/cache.cu | 2 +- src/ops/cast.cc | 2 +- src/ops/concat.cc | 4 +- src/ops/conv_2d.cc | 4 +- src/ops/element_unary.cc | 4 +- src/ops/element_unary.cpp | 3 +- src/ops/element_unary.cu | 3 +- src/ops/experts.cc | 13 +---- src/ops/experts.cpp | 28 +++------- src/ops/experts.cu | 27 +++------ src/ops/flat.cc | 3 +- src/ops/group_by.cc | 4 +- src/ops/group_by.cpp | 6 +- src/ops/group_by.cu | 6 +- src/ops/kernels/batch_matmul.cpp | 4 +- src/ops/kernels/batch_matmul.cu | 4 +- src/ops/kernels/cast_kernels.cpp | 3 +- src/ops/kernels/cast_kernels.cu | 3 +- src/ops/kernels/concat_kernels.cpp | 4 ++ src/ops/kernels/concat_kernels.cu | 4 ++ src/ops/kernels/conv_2d_kernels.cpp | 4 +- src/ops/kernels/conv_2d_kernels.cu | 4 +- 
src/ops/kernels/dropout_kernels.cpp | 2 +- src/ops/kernels/dropout_kernels.cu | 2 +- src/ops/kernels/flat_kernels.cpp | 4 ++ src/ops/kernels/flat_kernels.cu | 4 ++ src/ops/kernels/pool_2d_kernels.cpp | 4 +- src/ops/kernels/pool_2d_kernels.cu | 4 +- src/ops/kernels/reshape_kernels.cpp | 4 +- src/ops/kernels/reshape_kernels.cu | 4 +- src/ops/kernels/transpose_kernels.cpp | 4 ++ src/ops/kernels/transpose_kernels.cu | 4 ++ src/ops/layer_norm.cc | 3 +- src/ops/layer_norm.cpp | 2 +- src/ops/layer_norm.cu | 2 +- src/ops/linear.cc | 5 +- src/ops/mean.cc | 3 +- src/ops/noop.cc | 4 +- src/ops/pool_2d.cc | 4 +- src/ops/reduce.cpp | 2 +- src/ops/reduce.cu | 2 +- src/ops/reshape.cc | 2 +- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/ops/topk.cc | 4 +- src/ops/topk.cpp | 3 +- src/ops/topk.cu | 3 +- src/ops/transpose.cc | 4 +- src/parallel_ops/combine.cc | 2 +- .../kernels/allreduce_kernels.cpp | 2 +- src/parallel_ops/kernels/allreduce_kernels.cu | 2 +- src/parallel_ops/kernels/combine_kernels.cpp | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 4 +- .../kernels/partition_kernels.cpp | 4 +- src/parallel_ops/kernels/partition_kernels.cu | 4 +- .../kernels/reduction_kernels.cpp | 2 +- src/parallel_ops/kernels/reduction_kernels.cu | 2 +- .../kernels/replicate_kernels.cpp | 2 +- src/parallel_ops/kernels/replicate_kernels.cu | 2 +- src/runtime/inference_manager.cc | 17 ++++++ src/runtime/model.cc | 9 ++- src/runtime/simulator.cpp | 22 ++++---- src/runtime/simulator.cu | 26 ++++----- 101 files changed, 305 insertions(+), 226 deletions(-) diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index dae3953490..dcf070c975 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -9,7 +9,7 @@ class Op; class OpMeta { public: - OpMeta(FFHandler _handle); + // OpMeta(FFHandler _handle); OpMeta(FFHandler _handle, Op const *op); public: diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 3ba4f414d1..283e9a4290 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_MAX_BATCH_SIZE 64 #define AGGREGATE_MAX_N 128 +class Aggregate; + class AggregateMeta : public OpMeta { public: - AggregateMeta(FFHandler handle, int n); + AggregateMeta(FFHandler handle, Aggregate const *aggr); ~AggregateMeta(void); float **dev_exp_preds; float **dev_exp_grads; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 4302dd0733..a9f651b620 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 +class AggregateSpec; + class AggregateSpecMeta : public OpMeta { public: - AggregateSpecMeta(FFHandler handle, int n); + AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg); ~AggregateSpecMeta(void); float **dev_region_ptrs; }; diff --git a/include/flexflow/ops/cache.h b/include/flexflow/ops/cache.h index 1fbb1fa059..4f0b94ee5c 100644 --- a/include/flexflow/ops/cache.h +++ b/include/flexflow/ops/cache.h @@ -5,9 +5,11 @@ namespace FlexFlow { +class Cache; + class CacheMeta : public OpMeta { public: - CacheMeta(FFHandler handle); + CacheMeta(FFHandler handle, Cache const *c); float cache_score; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h 
index ddef59549c..043b5d19a7 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -12,9 +12,11 @@ namespace FlexFlow { +class ElementUnary; + class ElementUnaryMeta : public OpMeta { public: - ElementUnaryMeta(FFHandler handle); + ElementUnaryMeta(FFHandler handle, ElementUnary const *unary); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnActivationDescriptor_t actiDesc; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index f132003d30..1ed4678a5b 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -6,20 +6,11 @@ namespace FlexFlow { +class Experts; + class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation); + ExpertsMeta(FFHandler handler, Experts const *e); ~ExpertsMeta(void); // Thrust helper arrays diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ec6cdfb9ab..73025216cd 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Group_by; + class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n, float _alpha); + GroupByMeta(FFHandler handle, Group_by const *gb); ~GroupByMeta(void); float alpha; float **dev_region_ptrs; diff --git a/include/flexflow/ops/kernels/batch_matmul_kernels.h b/include/flexflow/ops/kernels/batch_matmul_kernels.h index 4de774ee06..c3923c4d4b 100644 --- a/include/flexflow/ops/kernels/batch_matmul_kernels.h +++ b/include/flexflow/ops/kernels/batch_matmul_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class BatchMatmul; + class BatchMatmulMeta : public OpMeta { public: - BatchMatmulMeta(FFHandler handler); + BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm); int a_seq_length_dim, b_seq_length_dim; }; diff --git a/include/flexflow/ops/kernels/cast_kernels.h b/include/flexflow/ops/kernels/cast_kernels.h index 3001d913ca..d601601ea2 100644 --- a/include/flexflow/ops/kernels/cast_kernels.h +++ b/include/flexflow/ops/kernels/cast_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Cast; + class CastMeta : public OpMeta { public: - CastMeta(FFHandler handle); + CastMeta(FFHandler handle, Cast const *cast); DataType input_data_type, output_data_type; }; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 4da6aaf5e2..4562ae871a 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Concat; + class ConcatMeta : public OpMeta { public: - ConcatMeta(FFHandler handle) : OpMeta(handle){}; + ConcatMeta(FFHandler handle, Concat const *cc); int legion_axis; }; diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index 7b2a0fe135..f83e4687d7 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Conv2D; + class Conv2DMeta : public OpMeta { public: - Conv2DMeta(FFHandler handler); + Conv2DMeta(FFHandler handler, Conv2D const *conv); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 
cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; cudnnFilterDescriptor_t filterDesc; diff --git a/include/flexflow/ops/kernels/flat_kernels.h b/include/flexflow/ops/kernels/flat_kernels.h index caf817512d..6aa5a13b42 100644 --- a/include/flexflow/ops/kernels/flat_kernels.h +++ b/include/flexflow/ops/kernels/flat_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Flat; + class FlatMeta : public OpMeta { public: - FlatMeta(FFHandler handle) : OpMeta(handle){}; + FlatMeta(FFHandler handle, Flat const *flat); }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index 7f73a8295d..c5a954763e 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Pool2D; + class Pool2DMeta : public OpMeta { public: - Pool2DMeta(FFHandler handle); + Pool2DMeta(FFHandler handle, Pool2D const *pool); ffTensorDescriptor_t inputTensor, outputTensor; ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; diff --git a/include/flexflow/ops/kernels/reshape_kernels.h b/include/flexflow/ops/kernels/reshape_kernels.h index e6c8c4d569..5b6fa5be19 100644 --- a/include/flexflow/ops/kernels/reshape_kernels.h +++ b/include/flexflow/ops/kernels/reshape_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Reshape; + class ReshapeMeta : public OpMeta { public: - ReshapeMeta(FFHandler handler); + ReshapeMeta(FFHandler handler, Reshape const *reshape); DataType data_type; }; @@ -44,4 +46,4 @@ void backward_kernel(T *input_grad_ptr, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/include/flexflow/ops/kernels/transpose_kernels.h b/include/flexflow/ops/kernels/transpose_kernels.h index 7ff6163b30..a2c8ff0483 100644 --- a/include/flexflow/ops/kernels/transpose_kernels.h +++ b/include/flexflow/ops/kernels/transpose_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Transpose; + class TransposeMeta : public OpMeta { public: - TransposeMeta(FFHandler handler) : OpMeta(handler){}; + TransposeMeta(FFHandler handler, Transpose const *transpose); int num_dim; int perm[MAX_TENSOR_DIM]; }; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 47144bf6d7..4b67692032 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class TopK; + class TopKMeta : public OpMeta { public: - TopKMeta(FFHandler handle); + TopKMeta(FFHandler handle, TopK const *topk); bool sorted; }; diff --git a/include/flexflow/ops/transpose.h b/include/flexflow/ops/transpose.h index 3e6fb575c0..bca0b83460 100644 --- a/include/flexflow/ops/transpose.h +++ b/include/flexflow/ops/transpose.h @@ -6,6 +6,8 @@ namespace FlexFlow { +class TransposeMeta; + class Transpose : public Op { public: using Params = TransposeParams; diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 456013cd81..4b2227b178 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Combine; + class CombineMeta : public OpMeta { public: - CombineMeta(FFHandler handle); + CombineMeta(FFHandler handle, Combine const *comb); DataType data_type; }; diff --git 
a/include/flexflow/parallel_ops/kernels/partition_kernels.h b/include/flexflow/parallel_ops/kernels/partition_kernels.h index 81b190603a..1e77090d11 100644 --- a/include/flexflow/parallel_ops/kernels/partition_kernels.h +++ b/include/flexflow/parallel_ops/kernels/partition_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Repartition; + class RepartitionMeta : public OpMeta { public: - RepartitionMeta(FFHandler handle); + RepartitionMeta(FFHandler handle, Repartition const *repart); DataType data_type; }; diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index e410f66325..6cda96aa8b 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -33,21 +33,21 @@ namespace FlexFlow { #define MOD(a, b) ((a) % (b)) < 0 ? ((a) % (b)) + (b) : ((a) % (b)) -class Conv2DMeta; -class LinearMeta; -class Pool2DMeta; -class ElementUnaryMeta; -class ElementBinaryMeta; -class LayerNormMeta; -// class EmbeddingMeta; -// class SoftmaxMeta; -class BatchMatmulMeta; -// class BatchNormMeta; -class ConcatMeta; -// class DropoutMeta; -class TransposeMeta; -class Op; -class FFModel; +// class Conv2DMeta; +// class LinearMeta; +// class Pool2DMeta; +// class ElementUnaryMeta; +// class ElementBinaryMeta; +// class LayerNormMeta; +// class EmbeddingMeta; +// class SoftmaxMeta; +// class BatchMatmulMeta; +// class BatchNormMeta; +// class ConcatMeta; +// class DropoutMeta; +// class TransposeMeta; +// class Op; +// class FFModel; /** * @brief Costs of an operator. @@ -751,19 +751,19 @@ class Simulator { strict_hash_to_operator_cost; public: - Conv2DMeta *conv2d_meta; - LinearMeta *linear_meta; - Pool2DMeta *pool2d_meta; - ElementUnaryMeta *ele_unary_meta; - LayerNormMeta *layernorm_meta; - // ElementBinaryMeta *ele_binary_meta; - // EmbeddingMeta *embedding_meta; - // SoftmaxMeta *softmax_meta; - BatchMatmulMeta *batch_matmul_meta; - // BatchNormMeta *batch_norm_meta; - ConcatMeta *concat_meta; - // DropoutMeta *dropout_meta; - TransposeMeta *transpose_meta; + // Conv2DMeta *conv2d_meta; + // LinearMeta *linear_meta; + // Pool2DMeta *pool2d_meta; + // ElementUnaryMeta *ele_unary_meta; + // LayerNormMeta *layernorm_meta; + // ElementBinaryMeta *ele_binary_meta; + // EmbeddingMeta *embedding_meta; + // SoftmaxMeta *softmax_meta; + // BatchMatmulMeta *batch_matmul_meta; + // BatchNormMeta *batch_norm_meta; + // ConcatMeta *concat_meta; + // DropoutMeta *dropout_meta; + // TransposeMeta *transpose_meta; int segment_size; int max_num_segments; // simulation could be slow if the number of segments // are too large diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..07f1f2af6b 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -27,7 +27,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 67810d3f5b..b021a50ee1 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -233,7 +233,7 @@ OpMeta *Aggregate::init_task(Task const *task, Runtime *runtime) { Aggregate *agg = (Aggregate *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateMeta *m = new AggregateMeta(handle, agg->n); + AggregateMeta *m = new 
AggregateMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -592,7 +592,7 @@ bool Aggregate::measure_operator_cost(Simulator *sim, return false; } - AggregateMeta *m = new AggregateMeta(sim->handler, n); + AggregateMeta *m = new AggregateMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index d5ebdb0c22..5a508cfac4 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -281,13 +281,14 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, out_dim); } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(hipMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(hipMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(hipFree(&dev_exp_preds)); checkCUDA(hipFree(&dev_exp_grads)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 38e141b252..9704302092 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -307,9 +307,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(cudaMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(cudaMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(cudaFree(&dev_exp_preds)); diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 19b2edc14a..32bd56e215 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -207,7 +207,7 @@ OpMeta *AggregateSpec::init_task(Task const *task, Runtime *runtime) { AggregateSpec *agg = (AggregateSpec *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); + AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -540,7 +540,7 @@ bool AggregateSpec::measure_operator_cost(Simulator *sim, return false; } - AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, n); + AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index 314e20a59c..a676fa81c3 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -290,9 +290,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { 
checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/aggregate_spec.cu b/src/ops/aggregate_spec.cu index 8d50d45d21..ac5a372efc 100644 --- a/src/ops/aggregate_spec.cu +++ b/src/ops/aggregate_spec.cu @@ -287,9 +287,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index ee7f87a7fb..10655a4a1a 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -156,7 +156,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/attention.cu b/src/ops/attention.cu index 9b8b90da70..59834b1300 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -194,7 +194,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index f4b06877e5..77b7be2ba8 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -272,7 +272,7 @@ OpMeta *BatchMatmul::init_task(Task const *task, Runtime *runtime) { BatchMatmul const *bmm = (BatchMatmul *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - BatchMatmulMeta *m = new BatchMatmulMeta(handle); + BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm); m->profiling = bmm->profiling; m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; @@ -609,7 +609,7 @@ bool BatchMatmul::measure_operator_cost(Simulator *sim, batch *= sub_input0.dims[i].size; } - BatchMatmulMeta *meta = sim->batch_matmul_meta; + BatchMatmulMeta *meta = new BatchMatmulMeta(sim->handler, this); // allocate tensors in simulator sim->free_all(); diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 106e5ebad2..933be29197 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -287,7 +287,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index b77e9d489f..ffbdef9f01 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -273,7 +273,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 18534455a0..a570e6ff17 100644 --- 
a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -681,7 +681,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 72ab7862a6..a79070c346 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -714,7 +714,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 691e45b559..33b862ae85 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -165,7 +165,7 @@ OpMeta *Cache::init_task(Task const *task, Runtime *runtime) { Cache *c = (Cache *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - CacheMeta *m = new CacheMeta(handle); + CacheMeta *m = new CacheMeta(handle, c); m->cache_score = 0.0f; m->profiling = c->profiling; m->inference_debugging = c->inference_debugging; diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 95c5995f9e..a9512c2c59 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -75,7 +75,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cache.cu b/src/ops/cache.cu index a113e57a1c..2f95e59669 100644 --- a/src/ops/cache.cu +++ b/src/ops/cache.cu @@ -74,7 +74,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 2a845cb303..f182f16e00 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -190,7 +190,7 @@ OpMeta *Cast::init_task(Task const *task, Runtime *runtime) { Cast *cast = (Cast *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - CastMeta *m = new CastMeta(handler); + CastMeta *m = new CastMeta(handler, cast); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; std::strcpy(m->op_name, cast->name); diff --git a/src/ops/concat.cc b/src/ops/concat.cc index 80935e387b..89e5e299c7 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -197,7 +197,7 @@ OpMeta *Concat::init_task(Task const *task, Runtime *runtime) { Concat *cc = (Concat *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - ConcatMeta *m = new ConcatMeta(handler); + ConcatMeta *m = new ConcatMeta(handler, cc); // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; @@ -365,7 +365,7 @@ bool Concat::measure_operator_cost(Simulator *sim, } } - ConcatMeta *m = sim->concat_meta; + ConcatMeta *m = new ConcatMeta(sim->handler, this); 
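  // With OpMeta now requiring the owning Op, the simulator constructs a fresh
  // ConcatMeta per cost measurement instead of reusing the cached
  // sim->concat_meta member, which this commit comments out in simulator.h.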
init_meta(m, this->legion_axis); sim->free_all(); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 7c524c81de..e48fa9d794 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -588,7 +588,7 @@ OpMeta *Conv2D::init_task(Task const *task, // regions[4], task->regions[4], FID_DATA, ctx, runtime, // false/*readOutput*/); - Conv2DMeta *m = new Conv2DMeta(handle); + Conv2DMeta *m = new Conv2DMeta(handle, conv); m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; @@ -1113,7 +1113,7 @@ bool Conv2D::measure_operator_cost(Simulator *sim, int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Conv2DMeta *m = sim->conv2d_meta; + Conv2DMeta *m = new Conv2DMeta(sim->handler, this); m->relu = activation == AC_MODE_RELU; // require input_c is divisible by groups diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 9fb2e6dc1f..844aeb6de3 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -354,7 +354,7 @@ OpMeta *ElementUnary::init_task(Task const *task, Runtime *runtime) { ElementUnary *eu = (ElementUnary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementUnaryMeta *m = new ElementUnaryMeta(handle); + ElementUnaryMeta *m = new ElementUnaryMeta(handle, eu); m->op_type = eu->op_type; m->data_type = eu->outputs[0]->data_type; // Input and output should have the same data type @@ -735,7 +735,7 @@ bool ElementUnary::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } - ElementUnaryMeta *m = sim->ele_unary_meta; + ElementUnaryMeta *m = new ElementUnaryMeta(sim->handler, this); m->op_type = op_type; if (use_cudnn(m->op_type)) { Domain input_domain, output_domain; diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..435abdfe11 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -282,7 +282,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..15e6852388 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -291,7 +291,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index a1761f069d..963df195f7 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -582,18 +582,7 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, - exp->num_experts, - exp->experts_start_idx, - exp->data_dim, - exp->out_dim, - 
exp->experts_num_layers, - exp->experts_internal_dim_size, - exp->effective_batch_size, - exp->num_chosen_experts, - exp->alpha, - exp->use_bias, - exp->activation); + ExpertsMeta *m = new ExpertsMeta(handle, exp); m->profiling = exp->profiling; m->inference_debugging = exp->inference_debugging; std::strcpy(m->op_name, exp->name); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 48536defd9..502be878a9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -35,25 +35,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, handle_unimplemented_hip_kernel(OP_EXPERTS); } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) {} + ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 4e3ef6f12c..6f0bd8afbb 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -1233,25 +1233,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) { +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) { expert_capacity = ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 669c457709..37a86cde2a 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -186,7 +186,8 @@ OpMeta *Flat::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const 
*)task->local_args); - FlatMeta *m = new FlatMeta(handler); + Flat *flat = (Flat *)task->args; + FlatMeta *m = new FlatMeta(handler, flat); return m; } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 50871983f5..75960e7dcd 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -264,7 +264,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); + GroupByMeta *m = new GroupByMeta(handle, gb); m->profiling = gb->profiling; m->inference_debugging = gb->inference_debugging; std::strcpy(m->op_name, gb->name); @@ -565,7 +565,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); + GroupByMeta *m = new GroupByMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 761c35f182..9ca6f77898 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -188,9 +188,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(hipMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 0ed09e20b3..43bcb900df 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -198,9 +198,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, } } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(cudaMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 7145af2108..8eeede65c7 100644 --- a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -13,13 +13,15 @@ * limitations under the License. */ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/batch_matmul.cu b/src/ops/kernels/batch_matmul.cu index ac280db1a4..97f13fa5a8 100644 --- a/src/ops/kernels/batch_matmul.cu +++ b/src/ops/kernels/batch_matmul.cu @@ -13,12 +13,14 @@ * limitations under the License. 
*/ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index 16b9b4cec0..1e561959f1 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -14,12 +14,13 @@ */ #include "flexflow/ops/kernels/cast_kernels.h" +#include "flexflow/ops/cast.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/cast_kernels.cu b/src/ops/kernels/cast_kernels.cu index a96f37dbbd..fdce63b9f1 100644 --- a/src/ops/kernels/cast_kernels.cu +++ b/src/ops/kernels/cast_kernels.cu @@ -13,12 +13,13 @@ * limitations under the License. */ +#include "flexflow/ops/cast.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index bf5d46b9cc..6c05e0143c 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/concat_kernels.h" +#include "flexflow/ops/concat.h" #include "flexflow/utils/hip_helper.h" #include @@ -23,6 +24,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/concat_kernels.cu b/src/ops/kernels/concat_kernels.cu index f625560625..2569c36b21 100644 --- a/src/ops/kernels/concat_kernels.cu +++ b/src/ops/kernels/concat_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/ops/concat.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index 7d2fa20c49..b7406f641d 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/conv_2d.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 6c0fd85496..65dc38f142 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -1,9 +1,11 @@ +#include "flexflow/ops/conv_2d.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index 14225f0bce..c8b1887fd4 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -28,7 +28,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index e142bba83b..d65b951f51 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -27,7 +27,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/flat_kernels.cpp b/src/ops/kernels/flat_kernels.cpp index be48854fc0..6815ce7492 100644 --- a/src/ops/kernels/flat_kernels.cpp +++ b/src/ops/kernels/flat_kernels.cpp @@ -14,11 +14,15 @@ */ #include "flexflow/ops/kernels/flat_kernels.h" +#include "flexflow/ops/flat.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 3836c02c94..fc0c0270c1 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -13,11 +13,15 @@ * limitations under the License. 
*/ +#include "flexflow/ops/flat.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index 8af85612ca..b3f20a35dd 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/hip_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/pool_2d_kernels.cu b/src/ops/kernels/pool_2d_kernels.cu index b418d20cd3..c236f049ba 100644 --- a/src/ops/kernels/pool_2d_kernels.cu +++ b/src/ops/kernels/pool_2d_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/reshape_kernels.cpp b/src/ops/kernels/reshape_kernels.cpp index b17d95bfea..47f407fd82 100644 --- a/src/ops/kernels/reshape_kernels.cpp +++ b/src/ops/kernels/reshape_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/reshape_kernels.cu b/src/ops/kernels/reshape_kernels.cu index 9786f63815..0a2b01ae52 100644 --- a/src/ops/kernels/reshape_kernels.cu +++ b/src/ops/kernels/reshape_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/transpose_kernels.cpp b/src/ops/kernels/transpose_kernels.cpp index 49a7d827f5..199e1cd0c1 100644 --- a/src/ops/kernels/transpose_kernels.cpp +++ b/src/ops/kernels/transpose_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/hip_helper.h" #include @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/kernels/transpose_kernels.cu 
b/src/ops/kernels/transpose_kernels.cu index b401ff0ba1..18a6e405af 100644 --- a/src/ops/kernels/transpose_kernels.cu +++ b/src/ops/kernels/transpose_kernels.cu @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -21,6 +22,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e9f8feae2b..40c575532f 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -882,7 +882,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = sim->layernorm_meta; + MemoryAllocator gpu_mem_allocator(sim->memory); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this, gpu_mem_allocator); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 9beb655d1d..2736dbf507 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index e242904775..b105ef0ea8 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 05529a46ec..f8181570ce 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -1219,7 +1219,10 @@ bool Linear::measure_operator_cost(Simulator *sim, int input_n = sub_input.get_volume() / input_c; int output_c = sub_output.dims[0].size; int output_n = sub_output.get_volume() / output_c; - LinearMeta *m = sim->linear_meta; + + MemoryAllocator gpu_mem_allocator(sim->memory); + LinearMeta *m = new LinearMeta( + sim->handler, output_n, this, gpu_mem_allocator, input_c * output_c); m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; diff --git a/src/ops/mean.cc b/src/ops/mean.cc index b2ec94fdf8..0d41276735 100644 --- a/src/ops/mean.cc +++ b/src/ops/mean.cc @@ -87,8 +87,7 @@ OpMeta *Mean::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handler); - return m; + return nullptr; } void Mean::forward(FFModel const &ff) {} diff --git a/src/ops/noop.cc b/src/ops/noop.cc index da2d4922e3..dabdf835dd 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -91,8 +91,8 @@ OpMeta *NoOp::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handle = *((FFHandler const *)task->local_args); - OpMeta *m = new 
OpMeta(handle); - return m; + // OpMeta *m = new OpMeta(handle); + return nullptr; } void NoOp::init_inference(FFModel const &ff, diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index e358448ddf..46722bd943 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -315,7 +315,7 @@ OpMeta *Pool2D::init_task(Task const *task, assert(task->regions.size() == 2); Pool2D const *pool = (Pool2D *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Pool2DMeta *m = new Pool2DMeta(handle); + Pool2DMeta *m = new Pool2DMeta(handle, pool); m->profiling = pool->profiling; m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); @@ -543,7 +543,7 @@ bool Pool2D::measure_operator_cost(Simulator *sim, int output_n = sub_output.dims[3].size; int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Pool2DMeta *m = sim->pool2d_meta; + Pool2DMeta *m = new Pool2DMeta(sim->handler, this); init_kernel(m, input_w, diff --git a/src/ops/reduce.cpp b/src/ops/reduce.cpp index c062955ed6..fe122b13eb 100644 --- a/src/ops/reduce.cpp +++ b/src/ops/reduce.cpp @@ -25,7 +25,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reduce.cu b/src/ops/reduce.cu index 65efd90e9b..1352787a12 100644 --- a/src/ops/reduce.cu +++ b/src/ops/reduce.cu @@ -24,7 +24,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 45da190680..04aea12c5f 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -180,7 +180,7 @@ OpMeta *Reshape::init_task(Task const *task, Runtime *runtime) { Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); + ReshapeMeta *m = new ReshapeMeta(handle, reshape); std::strcpy(m->op_name, reshape->name); m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..72370ab979 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -27,7 +27,7 @@ constexpr int kCUDANumThreads = 256; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..ea77f01f53 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -26,7 +26,7 @@ constexpr int kCUDANumThreads = 256; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = 
ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..0f48bf8126 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -23,7 +23,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..ea63dd5508 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -22,7 +22,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } diff --git a/src/ops/topk.cc b/src/ops/topk.cc index b38ff85f90..48da6bf341 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -223,7 +223,7 @@ OpMeta *TopK::init_task(Task const *task, Runtime *runtime) { TopK *topk = (TopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - TopKMeta *m = new TopKMeta(handle); + TopKMeta *m = new TopKMeta(handle, topk); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; @@ -464,7 +464,7 @@ bool TopK::measure_operator_cost(Simulator *sim, return false; } - TopKMeta *m = new TopKMeta(sim->handler); + TopKMeta *m = new TopKMeta(sim->handler, this); m->sorted = sorted; // allocate diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index b6e898b654..303c6e85e9 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -513,6 +513,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, // TODO: missing profiling here } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/topk.cu b/src/ops/topk.cu index cc87ee8a42..cfb2bf6448 100644 --- a/src/ops/topk.cu +++ b/src/ops/topk.cu @@ -509,6 +509,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, } } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 500b7867af..bea10c9d2a 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -190,7 +190,7 @@ OpMeta *Transpose::init_task(Task const *task, Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - TransposeMeta *m = new TransposeMeta(handle); + TransposeMeta *m = new TransposeMeta(handle, transpose); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; m->inference_debugging = transpose->inference_debugging; @@ -317,7 +317,7 @@ bool Transpose::measure_operator_cost(Simulator *sim, return false; } - TransposeMeta *m = sim->transpose_meta; + TransposeMeta *m = new TransposeMeta(sim->handler, this); this->init_meta(m, sub_input.get_domain(), sub_output.get_domain()); sim->free_all(); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7c266c5392..8411b42602 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -99,7 
+99,7 @@ OpMeta *Combine::init_task(Task const *task, Runtime *runtime) { Combine *cmb = (Combine *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - CombineMeta *m = new CombineMeta(handle); + CombineMeta *m = new CombineMeta(handle, cmb); m->input_type[0] = cmb->inputs[0]->data_type; m->output_type[0] = cmb->outputs[0]->data_type; assert(m->input_type[0] == m->output_type[0]); diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e395..fbb11fc705 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 5861f05d7a..1801ac8784 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..2a29be1ad4 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/combine_kernels.h" +#include "flexflow/parallel_ops/combine.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..5809e2d4f3 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -13,12 +13,14 @@ * limitations under the License. 
*/ +#include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/kernels/combine_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/partition_kernels.cpp b/src/parallel_ops/kernels/partition_kernels.cpp index cfd76c0f18..bd1c96d4c7 100644 --- a/src/parallel_ops/kernels/partition_kernels.cpp +++ b/src/parallel_ops/kernels/partition_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/partition_kernels.cu b/src/parallel_ops/kernels/partition_kernels.cu index 08008f1035..3a39b39fe4 100644 --- a/src/parallel_ops/kernels/partition_kernels.cu +++ b/src/parallel_ops/kernels/partition_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..1f3e8e0962 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..df7630976b 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..f49e0d4eb0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..0b5c434aa6 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git 
a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 0f71291ded..81a72a5c12 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -600,6 +600,23 @@ void FFModel::compile_inference() { assert(op->outputs[i]->parallel_tensor_guid != 0); } } + + // Check whether we need to reset input grads + // We use a parallel tensor's region as the key + std::set reset_inputs; + for (int l = operators.size() - 1; l >= 0; l--) { + Op *op = operators[l]; + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->region != LogicalRegion::NO_REGION); + if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { + // We should not reset input grads since other operators have already + // saved gradients into the region + op->reset_input_grads[i] = false; + } else { + reset_inputs.insert(op->inputs[i]->region); + } + } + } // Perform fusion optimizations if (config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 04a847b023..82cf538f93 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1465,6 +1465,7 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } +#ifdef DEADCODE OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { @@ -1482,8 +1483,14 @@ OpMeta::OpMeta(FFHandler _handle) } decoding_step = 0; } +#endif -OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { +OpMeta::OpMeta(FFHandler _handle, Op const *op) + : profiling(op->profiling), inference_debugging(op->inference_debugging) { + for (int i = 0; i < op->numInputs; i++) { + trainable_inputs[i] = op->trainable_inputs[i]; + reset_input_grads[i] = op->reset_input_grads[i]; + } for (int i = 0; i < op->numInputs; i++) { input_type[i] = op->inputs[i]->data_type; } diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 0daf151d2c..56931e0dc7 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -82,17 +82,17 @@ Simulator::Simulator(FFModel const *model, checkCUDA(hipEventCreate(&start_event)); checkCUDA(hipEventCreate(&end_event)); - conv2d_meta = new Conv2DMeta(handler); - // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); - // ele_binary_meta = new ElementBinaryMeta(handler); - // embedding_meta = new EmbeddingMeta(handler); - // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); - // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); + // linear_meta = new LinearMeta(handler, 4096); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); + // embedding_meta = new EmbeddingMeta(handler); + // softmax_meta = new SoftmaxMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); + // dropout_meta = new DropoutMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index b44ce1690a..056781f73d 100644 --- 
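Note on the reset-input-grads pass added to FFModel::compile_inference above: operators are walked in reverse execution order and each input's logical region is used as a deduplication key, so the last operator to consume a given region keeps the default reset_input_grads = true (its backward kernel may overwrite the gradient buffer), while every earlier consumer of the same region is switched to false and must accumulate into the buffer. The standalone sketch below restates that logic; the plain-int keys and the surrounding structs are simplified stand-ins for Legion::LogicalRegion and Op, not part of the patch.

// Sketch only: mirrors the reverse pass in compile_inference, with ints
// standing in for Legion::LogicalRegion handles.
#include <set>
#include <vector>

struct OpSketch {
  std::vector<int> input_regions;      // one entry per operator input
  std::vector<bool> reset_input_grads; // filled in by the pass
};

void mark_reset_input_grads(std::vector<OpSketch> &ops) {
  std::set<int> regions_seen;
  for (int l = (int)ops.size() - 1; l >= 0; l--) {
    OpSketch &op = ops[l];
    op.reset_input_grads.assign(op.input_regions.size(), true);
    for (size_t i = 0; i < op.input_regions.size(); i++) {
      if (regions_seen.count(op.input_regions[i])) {
        // A later operator already writes gradients into this region,
        // so this one must accumulate instead of resetting.
        op.reset_input_grads[i] = false;
      } else {
        regions_seen.insert(op.input_regions[i]);
      }
    }
  }
}

These per-input flags are what the new OpMeta(FFHandler, Op const *) constructor in the model.cc hunk above copies into each operator's metadata, alongside trainable_inputs.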
a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -81,17 +81,17 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); - conv2d_meta = new Conv2DMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; @@ -103,13 +103,13 @@ Simulator::~Simulator(void) { simulatorInst.destroy(); cudaEventDestroy(start_event); cudaEventDestroy(end_event); - delete conv2d_meta; - delete pool2d_meta; - delete ele_unary_meta; - delete batch_matmul_meta; - delete concat_meta; - delete transpose_meta; - delete task_manager; + // delete conv2d_meta; + // delete pool2d_meta; + // delete ele_unary_meta; + // delete batch_matmul_meta; + // delete concat_meta; + // delete transpose_meta; + // delete task_manager; } __host__ void From eb14798b929083dd8e68a44af15132b69f00fef5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 18 Oct 2023 12:51:54 -0400 Subject: [PATCH 025/198] residual rms norm backward --- include/flexflow/model.h | 2 + .../ops/kernels/residual_rms_norm_kernels.h | 10 + include/flexflow/ops/residual_rms_norm.h | 4 + src/ops/kernels/residual_rms_norm_kernels.cu | 181 ++++++++++++++++++ src/ops/residual_rms_norm.cc | 131 ++++++++++++- 5 files changed, 327 insertions(+), 1 deletion(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ac24e90900..30d125a542 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -171,6 +171,8 @@ enum TaskIDs { RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_BWD_TASK_ID, + RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72b..26a5686f0b 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -48,6 +48,16 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorR const &residual_input0, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorR const &residual_input1, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); } // namespace ResidualRMSNorm } // namespace 
Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 0d92a236e8..11750c1f6d 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -74,6 +74,10 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..75dee4808c 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -79,6 +79,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -219,6 +236,170 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += dY[index] * X[index] * gamma[j]; + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + T dX_val = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; + dX1[index] += dX_val; + dX2[index] += dX_val; + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
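For reference, the terms computed by ComputeInternalGradientsCUDAKernel and RMSNormBackwardCUDAKernel above, and by GammaBackwardCUDAKernel just below, follow from the standard RMSNorm derivative. Writing x_{ij} for the RMS input of row i (the residual sum saved as the residual output), r_i for the cached reciprocal RMS held in m->rms_ptr (assuming the usual convention r_i = (\varepsilon + \tfrac{1}{N}\sum_k x_{ik}^2)^{-1/2}), and y_{ij} = \gamma_j x_{ij} r_i:

\[
\frac{\partial L}{\partial x_{ij}}
  = \underbrace{r_i}_{c_1[i]}\,\gamma_j\,\frac{\partial L}{\partial y_{ij}}
  + \underbrace{\left(-\frac{r_i^{3}}{N}\sum_{k}\frac{\partial L}{\partial y_{ik}}\,\gamma_k\,x_{ik}\right)}_{c_2[i]} x_{ij},
\qquad
\frac{\partial L}{\partial \gamma_j} = \sum_i \frac{\partial L}{\partial y_{ij}}\, x_{ij}\, r_i .
\]

Because the residual output is an elementwise sum of the two residual inputs, the same \partial L/\partial x flows to both of them, which is why RMSNormBackwardCUDAKernel accumulates the same dX_val into dX1 and dX2, and why the cleanup commit later in this series can drop the raw residual-input regions from the backward task.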
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T const *residual_input0_ptr, + T *residual_input0_grad_ptr, + T const *residual_input1_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr); + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBackwardCUDAKernel + <<>>(M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I): Residual input 0 + regions[3](I/O): Residual input 0 grad + regions[4](I): Residual input 1 + regions[5](I/O): Residual input 1 grad + regions[6](I): weight + regions[7](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorR const &residual_input0, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorR const &residual_input1, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0.data_type); + assert(residual_input0.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1.data_type); + assert(residual_input1.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported 
data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b447a2a3b5..d382f05394 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -485,8 +485,137 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, } void ResidualRMSNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): residual output / RMS input + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I): residual input 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // regions[4](I): residual input 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(4, FID_DATA); + // regions[5](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(5, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(6, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(7, FID_DATA); + + runtime->execute_index_space(ctx, launcher); } + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I): Residual input 0 + regions[3](I/O): Residual input 0 grad + regions[4](I): Residual input 1 + regions[5](I/O): Residual input 1 grad + regions[6](I): weight + regions[7](I/O): weight_grad +*/ +void ResidualRMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 8); + assert(regions.size() == 8); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], 
task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output_rms_input = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual_input0 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual_input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[6], task->regions[6], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[7], task->regions[7], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, + output_grad, + residual_output_rms_input, + residual_input0, + residual_input0_grad, + residual_input1, + residual_input1_grad, + weight, + weight_grad); +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { From e7fa9cee3b97b6aa7519338b1237398a0c7d2fa1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 18 Oct 2023 16:48:30 -0400 Subject: [PATCH 026/198] cleanup --- .../ops/kernels/residual_rms_norm_kernels.h | 2 - src/ops/kernels/residual_rms_norm_kernels.cu | 14 +---- src/ops/residual_rms_norm.cc | 62 ++++++------------- 3 files changed, 22 insertions(+), 56 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 26a5686f0b..75dcfc945f 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -52,9 +52,7 @@ void backward_kernel_wrapper( ResidualRMSNormMeta const *m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &residual_output_rms_input, - GenericTensorAccessorR const &residual_input0, GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorR const &residual_input1, GenericTensorAccessorW const &residual_input1_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 75dee4808c..2fc4cc95c2 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -290,9 +290,7 @@ template void backward_kernel(ResidualRMSNormMeta const *m, T const *output_grad_ptr, T const *residual_output_rms_input_ptr, - T const *residual_input0_ptr, T *residual_input0_grad_ptr, - T const *residual_input1_ptr, T *residual_input1_grad_ptr, T const *weight_ptr, T *weight_grad_ptr, @@ -341,9 +339,7 @@ void backward_kernel_wrapper( ResidualRMSNormMeta const *m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &residual_output_rms_input, - GenericTensorAccessorR const &residual_input0, GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorR const &residual_input1, GenericTensorAccessorW const &residual_input1_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW 
const &weight_grad) { @@ -356,10 +352,8 @@ void backward_kernel_wrapper( cudaEventRecord(t_start, stream); } assert(output_grad.data_type == residual_output_rms_input.data_type); - assert(residual_output_rms_input.data_type == residual_input0.data_type); - assert(residual_input0.data_type == residual_input0_grad.data_type); - assert(residual_input0_grad.data_type == residual_input1.data_type); - assert(residual_input1.data_type == residual_input1_grad.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); assert(residual_input1_grad.data_type == weight.data_type); assert(weight.data_type == weight_grad.data_type); @@ -367,9 +361,7 @@ void backward_kernel_wrapper( backward_kernel(m, output_grad.get_half_ptr(), residual_output_rms_input.get_half_ptr(), - residual_input0.get_half_ptr(), residual_input0_grad.get_half_ptr(), - residual_input1.get_half_ptr(), residual_input1_grad.get_half_ptr(), weight.get_half_ptr(), weight_grad.get_half_ptr(), @@ -378,9 +370,7 @@ void backward_kernel_wrapper( backward_kernel(m, output_grad.get_float_ptr(), residual_output_rms_input.get_float_ptr(), - residual_input0.get_float_ptr(), residual_input0_grad.get_float_ptr(), - residual_input1.get_float_ptr(), residual_input1_grad.get_float_ptr(), weight.get_float_ptr(), weight_grad.get_float_ptr(), diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index d382f05394..1e0b652163 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -511,48 +511,34 @@ void ResidualRMSNorm::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(1, FID_DATA); - // regions[2](I): residual input 0 - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(2, FID_DATA); - // regions[3](I/O): residual input grad 0 + // regions[2](I/O): residual input grad 0 launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, inputs[0]->region_grad)); - launcher.add_field(3, FID_DATA); - // regions[4](I): residual input 1 - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(4, FID_DATA); - // regions[5](I/O): residual input grad 1 + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, inputs[1]->region_grad)); - launcher.add_field(5, FID_DATA); - // regions[3](I): gamma + launcher.add_field(3, FID_DATA); + // regions[4](I): gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(6, FID_DATA); - // regions[4](I/O): gamma_grad + launcher.add_field(4, FID_DATA); + // regions[5](I/O): gamma_grad launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, weights[0]->region_grad)); - launcher.add_field(7, FID_DATA); + launcher.add_field(5, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -560,19 +546,17 @@ void ResidualRMSNorm::backward(FFModel const &ff) { /* regions[0](I): RMS output_grad regions[1](I): Residual output / RMS input - regions[2](I): Residual input 0 - regions[3](I/O): Residual input 
0 grad - regions[4](I): Residual input 1 - regions[5](I/O): Residual input 1 grad - regions[6](I): weight - regions[7](I/O): weight_grad + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad */ void ResidualRMSNorm::backward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 8); - assert(regions.size() == 8); + assert(task->regions.size() == 6); + assert(regions.size() == 6); ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -583,34 +567,28 @@ void ResidualRMSNorm::backward_task(Task const *task, FID_DATA, ctx, runtime); - GenericTensorAccessorR residual_input0 = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input0_grad = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[3], - task->regions[3], + regions[2], + task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual_input1 = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input1_grad = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[5], - task->regions[5], + regions[3], + task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[6], task->regions[6], FID_DATA, ctx, runtime); + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( - m->weight_type[0], regions[7], task->regions[7], FID_DATA, ctx, runtime); + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); backward_kernel_wrapper(m, output_grad, residual_output_rms_input, - residual_input0, residual_input0_grad, - residual_input1, residual_input1_grad, weight, weight_grad); From 5f7f71082b24b324412e6456fee031d8fa94d223 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 18 Oct 2023 18:03:05 -0400 Subject: [PATCH 027/198] bug fix --- src/runtime/request_manager.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bdb87df051..c0573a50a3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1214,7 +1214,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.tokens.size() - 1; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; From 7b2bd0874b67f87a4e3c724f93c3054f6475770f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Oct 2023 00:16:18 -0400 Subject: [PATCH 028/198] finished peft bwd for residual rms norm --- .../ops/kernels/residual_rms_norm_kernels.h | 15 ++ include/flexflow/ops/residual_rms_norm.h | 9 + src/ops/kernels/residual_rms_norm_kernels.cu | 203 +++++++++++++++++- src/ops/layer_norm.cc | 2 +- src/ops/layer_norm.cu | 73 ++++--- src/ops/residual_rms_norm.cc | 90 +++++++- src/runtime/model.cc | 30 +++ 7 files changed, 379 insertions(+), 43 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h 
b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 75dcfc945f..4fbe34f83f 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RESIDUAL_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -38,6 +39,8 @@ class ResidualRMSNormMeta : public OpMeta { int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; namespace Kernels { @@ -48,6 +51,13 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output); void backward_kernel_wrapper( ResidualRMSNormMeta const *m, GenericTensorAccessorR const &output_grad, @@ -56,6 +66,11 @@ void backward_kernel_wrapper( GenericTensorAccessorW const &residual_input1_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 11750c1f6d..de6e6ea506 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -44,6 +44,11 @@ class ResidualRMSNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -78,6 +83,10 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 2fc4cc95c2..53804c0b1b 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -236,6 +236,116 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); 
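The inference_kernel_wrapper introduced in this hunk differs from forward_kernel_wrapper only in that it receives the BatchConfig and, for the at most one request with peft_bwd set, stashes the RMS input (the residual sum) into m->input_activation so peft_bwd_kernel can recompute the input gradients later without mapping the residual-output region. The helper below is not part of the patch; it condenses that stashing pattern (the same code appears inline here and in LayerNorm::inference_kernel_wrapper), with the token offset advanced for every active request; all names are taken from the patch except the helper itself.

// Hypothetical helper sketching the PEFT activation-stashing pattern.
#include "flexflow/batch_config.h"
#include "flexflow/ops/kernels/residual_rms_norm_kernels.h"
#include "flexflow/utils/cuda_helper.h"

namespace FlexFlow {

template <typename DT>
void stash_peft_input_sketch(ResidualRMSNormMeta *m,
                             BatchConfig const *bc,
                             DT const *rms_input, // residual_output values
                             int in_dim,
                             cudaStream_t stream) {
  int token_offset = 0;
  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
    if (bc->request_completed[i]) {
      continue;
    }
    int num_tokens = bc->requestsInfo[i].num_tokens_in_batch;
    if (bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID &&
        bc->requestsInfo[i].peft_bwd) {
      // allocator and field names as in the patch; lifetime management
      // of the scratch space is not shown here
      size_t bytes = sizeof(DT) * (size_t)num_tokens * in_dim;
      m->input_activation =
          m->handle.peft_activation_allocator->allocate_instance_untyped(
              bytes);
      checkCUDA(cudaMemcpyAsync(m->input_activation,
                                rms_input + (size_t)token_offset * in_dim,
                                bytes,
                                cudaMemcpyDeviceToDevice,
                                stream));
    }
    token_offset += num_tokens;
  }
}

} // namespace FlexFlow

Because the stashed activation replaces the region read, ResidualRMSNorm::peft_bwd later in this patch maps only four regions: the output gradient, the two residual-input gradients, and gamma.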
+ } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + residual_output.get_float_ptr() + + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + residual_output.get_half_ptr() + + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { @@ -325,15 +435,44 @@ void backward_kernel(ResidualRMSNormMeta const *m, weight_grad_ptr); } +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, 
+ weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr); +} + /* regions[0](I): RMS output_grad regions[1](I): Residual output / RMS input - regions[2](I): Residual input 0 - regions[3](I/O): Residual input 0 grad - regions[4](I): Residual input 1 - regions[5](I/O): Residual input 1 grad - regions[6](I): weight - regions[7](I/O): weight_grad + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad */ void backward_kernel_wrapper( ResidualRMSNormMeta const *m, @@ -390,6 +529,58 @@ void backward_kernel_wrapper( } } +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 40c575532f..0a467f0984 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -668,7 +668,7 @@ Legion::FutureMap Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index b105ef0ea8..6e12c53230 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -261,53 +261,56 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { - // check that at most one dimension after the first is > 1. 
TODO(goliaro): - // support case where this condition does not hold - int non_unit_dims_encountered = 0; - for (int i = 1; i < input.domain.get_dim(); i++) { - int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; - if (dim_i > 1) { - non_unit_dims_encountered++; + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; } } - assert(non_unit_dims_encountered <= 1); - - // allocate space for all peft tokens - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * bc->num_active_peft_tokens() * - in_dim); + assert(num_peft_requests <= 1); int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - // Skip non-PEFT requests and PEFT forward-only requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - if (m->input_type[0] == DT_FLOAT) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_float_ptr() + tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } else if (m->input_type[0] == DT_HALF) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_half_ptr() + tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } } } } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 1e0b652163..07137726d1 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -444,7 +444,8 @@ void ResidualRMSNorm::inference_task(Task const *task, m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, 
runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + inference_kernel_wrapper( + m, bc, input1, input2, weight, residual_output, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -594,6 +595,93 @@ void ResidualRMSNorm::backward_task(Task const *task, weight_grad); } +Legion::FutureMap + ResidualRMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[2](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[3](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // regions[4](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void ResidualRMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 4); + assert(regions.size() == 4); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper( + m, output_grad, residual_input0_grad, residual_input1_grad, weight); +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 82cf538f93..a1b5b07d8d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5433,6 +5433,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar 
registrar(RESIDUAL_RMSNORM_BWD_TASK_ID, + "Residual RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + "Residual RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, "layernorm_peft_bwd_task"); From d2f177d36af88254ae9f40df0098cf07a49aa222 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Oct 2023 15:38:40 -0400 Subject: [PATCH 029/198] sigmoid_silu_multi backward and peft_bwd --- include/flexflow/model.h | 2 + include/flexflow/ops/sigmoid_silu_multi.h | 32 ++- src/ops/fused.cu | 3 +- src/ops/sigmoid_silu_multi.cc | 170 +++++++++++++++- src/ops/sigmoid_silu_multi.cu | 232 +++++++++++++++++++++- src/runtime/model.cc | 32 +++ 6 files changed, 466 insertions(+), 5 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 30d125a542..4e863952cc 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -113,6 +113,8 @@ enum TaskIDs { ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, + SIGMOID_SILU_MULTI_BWD_TASK_ID, + SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260a..28e3bfed3e 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -1,5 +1,6 @@ #pragma once +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/memory_allocator.h" @@ -27,6 +28,11 @@ class SigmoidSiluMulti : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -55,6 +61,14 @@ class SigmoidSiluMulti : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -65,10 +79,24 @@ class SigmoidSiluMulti : public Op { T const *input2_ptr, T *output_ptr, ffStream_t stream); - static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + static void inference_kernel_wrapper(SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, 
GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output); + static void + backward_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); + static void + peft_bwd_kernel_wrapper(SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); }; class SigmoidSiluMultiMeta : public OpMeta { @@ -80,6 +108,8 @@ class SigmoidSiluMultiMeta : public OpMeta { public: Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 692316c6d4..b9ce88e02c 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -617,8 +617,9 @@ __host__ void case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, my_input_accessor[0], my_input_accessor[1], my_output_accessor[0]); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3b2ed7cef4..e36eb36d31 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -254,7 +254,173 @@ void SigmoidSiluMulti::forward(FFModel const &ff) { } void SigmoidSiluMulti::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // output grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[1](I): input 1 + regions[2](I): input 2 + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 5); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR output_grad 
= helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::backward_kernel_wrapper( + m, output_grad, input1, input2, input1_grad, input2_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, + shard_id, + nullptr, + {output_grad, input1, input2}, + {}, + {input1_grad, input2_grad}); + } +} + +FutureMap + SigmoidSiluMulti::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // output grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() <= 0) { + return; + } + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, bc, output_grad, input1_grad, input2_grad); } 
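For reference, the forward op whose gradients these tasks compute is out = silu(input1) * input2, with silu(x) = x * sigmoid(x). The sketch below is a minimal host-side reference for the analytic gradients, useful for cross-checking the CUDA/HIP SigmoidSiluMultiBackwardKernel; the function name and float-only signature are illustrative and not part of this patch.

#include <cmath>
#include <cstddef>

// Accumulates d(out)/d(input1) and d(out)/d(input2) for out = silu(x1) * x2.
static void sigmoid_silu_multi_backward_reference(std::size_t n,
                                                  float const *out_grad,
                                                  float const *x1,
                                                  float const *x2,
                                                  float *x1_grad,
                                                  float *x2_grad) {
  for (std::size_t i = 0; i < n; i++) {
    float const s = 1.0f / (1.0f + std::exp(-x1[i])); // sigmoid(x1)
    // d out / d x2 = silu(x1) = x1 * sigmoid(x1)
    x2_grad[i] += out_grad[i] * x1[i] * s;
    // d out / d x1 = x2 * silu'(x1) = x2 * s * (1 + x1 * (1 - s))
    x1_grad[i] += out_grad[i] * x2[i] * s * (1.0f + x1[i] * (1.0f - s));
  }
}

In the peft_bwd path, x1 and x2 correspond to the two halves of the cached m->input_activation buffer saved by inference_kernel_wrapper.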
FutureMap SigmoidSiluMulti::inference( @@ -347,7 +513,7 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + SigmoidSiluMulti::inference_kernel_wrapper(m, bc, input1, input2, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index ea63dd5508..597f7ecdab 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -45,9 +45,34 @@ __global__ void SigmoidSiluMultiKernel(int num_elements, } } +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); + + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = exp(-x1_grad_val) / + ((1.0f + exp(-sigmoid_val)) * (1.0f + exp(-sigmoid_val))); + input1_grad_ptr[i] += T(x1_grad_val); + } +} + /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -64,6 +89,77 @@ void SigmoidSiluMulti::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + m->input_activation = + allocator->allocate_instance_untyped(2 * input_tensor_size); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, 
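/* Layout note: m->input_activation holds 2 * input_tensor_size bytes, with
   input1's PEFT tokens in the first half and input2's in the second half;
   peft_bwd_kernel_wrapper later reads the two halves back at these same
   offsets (base and base + num_peft_tokens * in_dim elements). */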
+ cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiKernel<<profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>( + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>( + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + 
checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index a1b5b07d8d..3ab1049f4a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5328,6 +5328,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_BWD_TASK_ID, + "SigmoidSiluMulti Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + "SigmoidSiluMulti PEFT Bwd"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti PEFT Bwd Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); From 8b1f76b2c03652d1fb0d977ef488df2613d6f79b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Oct 2023 15:54:00 -0400 Subject: [PATCH 030/198] hip_rocm update --- src/ops/sigmoid_silu_multi.cpp | 263 ++++++++++++++++++++++++++++++--- 1 file changed, 242 insertions(+), 21 deletions(-) diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 0f48bf8126..ccd622ff17 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -34,36 +34,46 @@ SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { } } -__device__ __forceinline__ float sigmoid_float(float x) { - return 1.0 / (1.0 + expf(-x)); -} - -__device__ __forceinline__ half sigmoid_half(half x) { - return (half)1.0 / ((half)1.0 + hexp(-x)); -} - -__global__ void SigmoidSiluMultiKernelFloat(int num_elements, - float const *input1_ptr, - float const *input2_ptr, - float *output_ptr) { +template +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = - input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; } } -__global__ void SigmoidSiluMultiKernelHalf(int num_elements, - half const *input1_ptr, - half const *input2_ptr, - half *output_ptr) { +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); + + input1_grad_ptr[i] 
+= ss_grad_val * T(sigmoid_val); + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = exp(-x1_grad_val) / + ((1.0f + exp(-sigmoid_val)) * (1.0f + exp(-sigmoid_val))); + input1_grad_ptr[i] += T(x1_grad_val); } } /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -81,8 +91,78 @@ void SigmoidSiluMulti::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + m->input_activation = + allocator->allocate_instance_untyped(2 * input_tensor_size); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -92,7 +172,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -116,4 +196,145 @@ void SigmoidSiluMulti::inference_kernel_wrapper( } } +/*static*/ +void 
SigmoidSiluMulti::backward_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + 
input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow From 84c391bfd0f9c026c43e933c0c5915a84d43119f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 20 Oct 2023 00:31:35 -0400 Subject: [PATCH 031/198] support peft_bwd for fused layers --- config/config.linux | 2 +- src/ops/fused.cu | 31 ++++++++++++++----------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/config/config.linux b/config/config.linux index dbf3d3dd01..3686237538 100755 --- a/config/config.linux +++ b/config/config.linux @@ -13,7 +13,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Debug} +BUILD_TYPE=${BUILD_TYPE:-Release} INFERENCE_TESTS=${INFERENCE_TESTS:-OFF} LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"} diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b9ce88e02c..eaf1831beb 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1007,18 +1007,16 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, } case OP_RESIDUAL_RMS_NORM: { // TODO: implement me - assert(false); assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - // ResidualRMSNormMeta const *m = (ResidualRMSNormMeta - // *)metas->meta[op]; - // Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // my_weight_accessor[0], - // my_output_accessor[0], - // my_output_accessor[1]); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta*)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_weight_accessor[0]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -1152,14 +1150,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - // SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta - // *)metas->meta[op]; - // TODO: implement me - assert(false); - // SigmoidSiluMulti::inference_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // my_output_accessor[0]); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta*)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); break; } case OP_SOFTMAX: { From 1cc723e3dfa0d9448a9d10224fb0927264b82292 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 20 Oct 2023 00:32:11 -0400 Subject: [PATCH 032/198] format --- src/ops/fused.cu | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ops/fused.cu 
b/src/ops/fused.cu index eaf1831beb..3030b23830 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1010,7 +1010,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta*)metas->meta[op]; + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, my_output_grad_accessor[0], @@ -1150,13 +1150,12 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta*)metas->meta[op]; - SigmoidSiluMulti::peft_bwd_kernel_wrapper( - m, - bc, - my_output_grad_accessor[0], - my_input_grad_accessor[0], - my_input_grad_accessor[1]); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); break; } case OP_SOFTMAX: { From f1d5dc0ba0e66c5fc3a263a590dee6ccbe33f253 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 21 Oct 2023 06:21:36 -0400 Subject: [PATCH 033/198] residual layer norm bwd / peft_bwd --- include/flexflow/model.h | 4 + include/flexflow/ops/residual_layer_norm.h | 36 +- src/ops/fused.cu | 4 +- src/ops/residual_layer_norm.cc | 293 ++++++++++- src/ops/residual_layer_norm.cu | 556 ++++++++++++++++++++- src/ops/residual_rms_norm.cc | 8 +- src/runtime/model.cc | 32 ++ 7 files changed, 923 insertions(+), 10 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4e863952cc..5d986c1329 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -109,8 +109,12 @@ enum TaskIDs { LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, + RESIDUAL_LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, SIGMOID_SILU_MULTI_BWD_TASK_ID, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 0e9be82125..35ddb171d4 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -40,6 +40,11 @@ class ResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -65,6 +70,14 @@ class ResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -78,7 +91,8 @@ class ResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t 
stream); - static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + static void inference_kernel_wrapper(ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -86,6 +100,24 @@ class ResidualLayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void + backward_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + + static void + peft_bwd_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias, use_two_residuals; @@ -107,6 +139,8 @@ class ResidualLayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 3030b23830..255136099a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -538,8 +538,7 @@ __host__ void } case OP_RESIDUAL_LAYERNORM: { assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; if (m->use_two_residuals) { assert(fused->op_num_inputs[op] == 3); } else { @@ -566,6 +565,7 @@ __host__ void } } ResidualLayerNorm::inference_kernel_wrapper(m, + bc, my_input_accessor[0], my_input_accessor[1], residual2, diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 7de40fb389..ce82ec6702 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -516,7 +516,296 @@ void ResidualLayerNorm::forward(FFModel const &ff) { } void ResidualLayerNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection 
id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + ResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual1_grad, + residual2_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap ResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap 
argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 
3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + ResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); } Op *ResidualLayerNorm::materialize(FFModel &ff, @@ -734,7 +1023,7 @@ void ResidualLayerNorm::inference_task( m->effective_num_elements * m->effective_batch_size); ResidualLayerNorm::inference_kernel_wrapper( - m, input, residual1, residual2, added_output, output, gamma, beta); + m, bc, input, residual1, residual2, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index ea77f01f53..4bfac1887f 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -22,6 +22,7 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, @@ -73,6 +74,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -186,7 +204,8 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -203,6 +222,63 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = + added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, @@ -240,4 +316,482 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
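// Host reference for what ComputeGradientFusedParamsCUDAKernel computes per
// row: with ds = sum_j(dY*X*gamma) and db = sum_j(dY*gamma), the two
// coefficients fold the mean/rstd corrections of the layer-norm backward
// into a "c1*X + c2" form. Function name is illustrative only.
inline void fused_params_ref(float mean, float rstd, float ds, float db,
                             int N, float &c1, float &c2) {
  float s = 1.0f / (float)N;
  float a = (db * mean - ds) * rstd * rstd * rstd * s;
  c1 = a;
  c2 = -(a * mean + db * rstd * s);
}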
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
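// Host reference for the column-wise reductions that the two
// GammaBetaBackward kernels above perform, useful as a correctness check:
//   dgamma[j] = sum_i dY[i,j] * (X[i,j] - mean[i]) * rstd[i]
//   dbeta[j]  = sum_i dY[i,j]
// Names are illustrative; the tiled kernel only exists to keep this
// reduction coalesced when M is large.
#include <cstdint>
#include <vector>

void gamma_beta_grad_ref(int64_t M, int64_t N,
                         std::vector<float> const &dY,
                         std::vector<float> const &X,
                         std::vector<float> const &mean,
                         std::vector<float> const &rstd,
                         std::vector<float> &dgamma,
                         std::vector<float> &dbeta) {
  for (int64_t j = 0; j < N; ++j) {
    float g = 0.f, b = 0.f;
    for (int64_t i = 0; i < M; ++i) {
      float dy = dY[i * N + j];
      g += dy * (X[i * N + j] - mean[i]) * rstd[i];
      b += dy;
    }
    dgamma[j] = g;
    dbeta[j] = b;
  }
}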
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] += f_grad_input; + dX_residual1_i[l] += f_grad_input; + if (dX_residual2 != nullptr) { + dX_residual2_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + 
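// Host reference for the per-row input gradient that compute_gI /
// layer_norm_grad_input_kernel above accumulate (dgamma/dbeta omitted).
// It uses the same two statistics, s1 = sum(dY*gamma) and
// s2 = sum(dY*gamma*(X-mean)*rstd); the function name is illustrative.
#include <cstddef>
#include <vector>

void layernorm_row_dx_ref(std::vector<float> const &dy,
                          std::vector<float> const &x,
                          std::vector<float> const &gamma, // empty => no affine
                          float mean, float rstd,
                          std::vector<float> &dx_accum) {
  size_t N = x.size();
  float s1 = 0.f, s2 = 0.f;
  for (size_t i = 0; i < N; ++i) {
    float g = gamma.empty() ? 1.f : gamma[i];
    s1 += dy[i] * g;
    s2 += dy[i] * g * (x[i] - mean) * rstd;
  }
  float inv_n = 1.f / (float)N;
  for (size_t i = 0; i < N; ++i) {
    float g = gamma.empty() ? 1.f : gamma[i];
    float grad =
        ((float)N * g * dy[i] - (x[i] - mean) * rstd * s2 - s1) * inv_n * rstd;
    dx_accum[i] += grad; // the kernels accumulate (+=) into existing grads
  }
}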
residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 07137726d1..e2bc29635a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -499,11 +499,11 @@ void ResidualRMSNorm::backward(FFModel const &ff) { 0 /*mapper_id*/, outputs[0]->machine_view.hash()); // regions[0](I): RMS output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - outputs[0]->region_grad)); + outputs[1]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I): residual output / RMS input launcher.add_region_requirement(RegionRequirement(outputs[0]->part, @@ -617,11 +617,11 @@ Legion::FutureMap 0 /*mapper_id*/, machine_view_hash); // regions[0](I): RMS output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_outputs[0]->region)); + batch_outputs[1]->region)); launcher.add_field(0, FID_DATA); // regions[2](I/O): residual input grad 0 launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 3ab1049f4a..8939e9e74d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5259,6 +5259,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_BWD_TASK_ID, + "residual_layernorm_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_backward_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "residual_layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, From 3b50e17b7964ac920511df2f12e06a2de6f766ca Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 21 Oct 2023 20:18:25 -0400 Subject: [PATCH 034/198] fix typo --- src/ops/kernels/conv_2d_kernels.cpp | 6 +++--- src/ops/kernels/conv_2d_kernels.cu | 6 +++--- src/ops/kernels/linear_kernels.cpp | 8 ++++---- src/ops/kernels/linear_kernels.cu | 8 ++++---- src/ops/kernels/lora_linear_kernels.cu | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index b7406f641d..85a94ad6be 100644 --- 
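// The model.cc hunks above repeat one registration pattern per new task.
// A sketch of a helper that could factor it out (the helper itself is
// hypothetical; the Legion calls mirror the ones used in the patch):
#include <vector>
#include "legion.h"

template <void (*TASK)(Legion::Task const *,
                       std::vector<Legion::PhysicalRegion> const &,
                       Legion::Context,
                       Legion::Runtime *)>
void register_gpu_leaf_task(Legion::TaskID tid, char const *name,
                            Legion::Runtime *runtime, bool pre_register,
                            bool enable_control_replication) {
  Legion::TaskVariantRegistrar registrar(tid, name);
  registrar.add_constraint(
      Legion::ProcessorConstraint(Legion::Processor::TOC_PROC));
  registrar.set_leaf();
  if (pre_register) {
    Legion::Runtime::preregister_task_variant<TASK>(registrar, name);
  } else {
    if (enable_control_replication) {
      registrar.global_registration = false;
    }
    runtime->register_task_variant<TASK>(registrar);
  }
}
// e.g. register_gpu_leaf_task<ResidualLayerNorm::peft_bwd_task>(
//          RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
//          "residual_layernorm_peft_bwd_task", runtime, pre_register,
//          enable_control_replication);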
a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -328,7 +328,7 @@ void backward_kernel(Conv2DMeta const *m, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn, &alpha, @@ -343,7 +343,7 @@ void backward_kernel(Conv2DMeta const *m, kernel_grad_ptr, m->handle.workSpace, m->handle.workSpaceSize)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn, @@ -354,7 +354,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 65dc38f142..661acdf732 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -311,7 +311,7 @@ void backward_kernel(Conv2DMeta const *m, reluBackward<<>>( output_grad_ptr, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, &alpha, @@ -326,7 +326,7 @@ void backward_kernel(Conv2DMeta const *m, &alpha, m->filterDesc, kernel_grad_ptr)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, @@ -337,7 +337,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 87b39126c5..e92cc77f3a 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -393,7 +393,7 @@ void peft_bwd_kernel(LinearMeta const *m, assert(m->activation == AC_MODE_NONE); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, @@ -455,7 +455,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_N, @@ -476,7 +476,7 @@ void backward_kernel(LinearMeta const *m, in_dim, compute_type, HIPBLAS_GEMM_DEFAULT)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -500,7 +500,7 @@ void backward_kernel(LinearMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/linear_kernels.cu 
b/src/ops/kernels/linear_kernels.cu index edf3cdaf07..0aa6661187 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -479,7 +479,7 @@ void peft_bwd_kernel(LinearMeta const *m, assert(m->activation == AC_MODE_NONE); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, @@ -542,7 +542,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -583,7 +583,7 @@ void backward_kernel(LinearMeta const *m, assert(false && "Only L2 regularization is supported"); } - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -607,7 +607,7 @@ void backward_kernel(LinearMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index eab98a24e7..a3fc071f11 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -295,7 +295,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, LoraLinearWeight weight = m->model_weights[bc->requestsInfo[i].peft_model_id]; int rank = weight.rank; - // Compute w1's gradiant + // Compute w1's gradient // NOTE: we use alpha=1 for w1_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -316,7 +316,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute gradiants w.r.t. low_rank activation + // Compute gradients w.r.t. 
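// The four GEMMs in this LoRA peft_bwd path (whose comments are corrected
// above and below) follow the standard LoRA backward data flow. A scalar,
// single-token reference with illustrative names; w0 is the in_dim x rank
// factor, w1 the rank x out_dim factor, low_rank_act = x*w0 was saved in
// the forward pass, and any scaling factors are omitted:
#include <vector>

void lora_backward_ref(std::vector<float> const &x,            // [in_dim]
                       std::vector<float> const &dy,           // [out_dim]
                       std::vector<float> const &w0,           // [in_dim*rank]
                       std::vector<float> const &w1,           // [rank*out_dim]
                       std::vector<float> const &low_rank_act, // [rank]
                       std::vector<float> &dw0, std::vector<float> &dw1,
                       std::vector<float> &dx,
                       int in_dim, int rank, int out_dim) {
  std::vector<float> d_low_rank(rank, 0.f);
  for (int r = 0; r < rank; ++r) {
    for (int o = 0; o < out_dim; ++o) {
      dw1[r * out_dim + o] += low_rank_act[r] * dy[o]; // w1's gradient
      d_low_rank[r] += w1[r * out_dim + o] * dy[o];    // grad of low-rank act
    }
  }
  for (int i = 0; i < in_dim; ++i) {
    for (int r = 0; r < rank; ++r) {
      dw0[i * rank + r] += x[i] * d_low_rank[r]; // w0's gradient
      dx[i] += w0[i * rank + r] * d_low_rank[r]; // input gradient
    }
  }
}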
low_rank activation // and save the results to low_rank_activation // NOTE: we use alpha=1 for input_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, @@ -338,7 +338,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute w0's gradiant + // Compute w0's gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -359,7 +359,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute input gradiant + // Compute input gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != nullptr) { checkCUDA(cublasGemmEx(m->handle.blas, From bdb590b3cb8e8e132856f75438bf155745480b91 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 22 Oct 2023 00:11:33 -0400 Subject: [PATCH 035/198] add_bias_residual_layer_norm backward and peft_bwd --- .../ops/add_bias_residual_layer_norm.h | 60 +- src/ops/add_bias_residual_layer_norm.cc | 354 +++++++++-- src/ops/add_bias_residual_layer_norm.cpp | 595 +++++++++++++++++- src/ops/add_bias_residual_layer_norm.cu | 564 ++++++++++++++++- src/ops/fused.cpp | 14 +- src/ops/fused.cu | 14 +- src/ops/residual_layer_norm.cc | 18 +- src/runtime/model.cc | 34 + 8 files changed, 1568 insertions(+), 85 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index bb470376c3..5c4a49f998 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -38,6 +38,11 @@ class AddBiasResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -61,6 +66,14 @@ class AddBiasResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -76,16 +89,51 @@ class AddBiasResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + template + static void backward_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + ffStream_t stream); + static void + 
backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + template + static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + static void + peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; @@ -107,6 +155,8 @@ class AddBiasResidualLayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 42fbb3016a..5d19dffdbc 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -517,10 +517,6 @@ void AddBiasResidualLayerNorm::forward(FFModel const &ff) { assert(false); } -void AddBiasResidualLayerNorm::backward(FFModel const &ff) { - assert(false); -} - FutureMap AddBiasResidualLayerNorm::inference( FFModel const &ff, BatchConfigFuture const &bc, @@ -546,50 +542,51 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // attn output + // input launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual + launcher.add_field(2, FID_DATA); + // added_output: input + attn bias + residual launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - // layer norm output + launcher.add_field(3, FID_DATA); + // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); - // attn final bias - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); launcher.add_field(4, FID_DATA); if (elementwise_affine) { + // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); launcher.add_field(5, FID_DATA); - if (use_bias) { + // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, 0 /*projection id*/, READ_ONLY, @@ 
-602,11 +599,11 @@ FutureMap AddBiasResidualLayerNorm::inference( } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias + regions[0](I): input + regions[1](I): attn bias + regions[2](I): residual + regions[3](O): added output + regions[4](O): output regions[5](I): gamma regions[6](I): beta */ @@ -630,26 +627,28 @@ void AddBiasResidualLayerNorm::inference_task( GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + m->output_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + m->output_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain residual_domain = runtime->get_index_space_domain( + Domain attn_bias_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( + Domain residual_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - Domain out_domain = runtime->get_index_space_domain( + Domain added_out_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - Domain attn_bias_domain = runtime->get_index_space_domain( + Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); + Domain gamma_domain, beta_domain; assert(in_domain.get_volume() == out_domain.get_volume()); @@ -707,16 +706,7 @@ void AddBiasResidualLayerNorm::inference_task( } AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - (int)attn_bias_dim, - (int)residual_domain.get_volume(), - input, - added_output, - output, - residual, - attn_bias, - gamma, - beta); + m, bc, input, attn_bias, residual, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -739,6 +729,288 @@ void AddBiasResidualLayerNorm::inference_task( } } +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + 
EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 5 + (m->elementwise_affine ? (m->use_bias ? 
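// Summary of the region layout that the backward launcher above sets up and
// that backward_task below unpacks, in add_field order (the enum is an
// illustrative summary, not part of the patch):
enum AddBiasResidualLayerNormBwdRegion {
  BWD_OUTPUT_GRAD = 0, // outputs[1]->region_grad, READ_ONLY
  BWD_ADDED_OUTPUT,    // outputs[0]->region,      READ_ONLY
  BWD_INPUT_GRAD,      // inputs[0]->region_grad,  READ_WRITE
  BWD_RESIDUAL_GRAD,   // inputs[1]->region_grad,  READ_WRITE
  BWD_ATTN_BIAS_GRAD,  // weights[0]->region_grad, READ_WRITE
  BWD_GAMMA,           // weights[1]->region,      only if elementwise_affine
  BWD_GAMMA_GRAD,      // weights[1]->region_grad, only if elementwise_affine
  BWD_BETA_GRAD        // weights[2]->region_grad, only if use_bias
};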
3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + AddBiasResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual_grad, + attn_bias_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // attn bias grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta const *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 4 + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); +} + bool AddBiasResidualLayerNorm::measure_operator_cost( Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { return false; diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd9..a0fdd1d1f7 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -23,6 +23,7 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int 
kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, @@ -74,6 +75,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -216,19 +234,77 @@ void AddBiasResidualLayerNorm::inference_kernel( /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = + added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { 
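// How the wrapper above recovers sizes from Legion accessor domains:
// hi() and lo() are inclusive bounds, so the extent along dimension 0 is
// hi()[0] - lo()[0] + 1 (used for in_dim and attn_bias_dim), while
// residual_volume is simply domain.get_volume(). Helper name illustrative.
#include "legion.h"

inline int domain_extent0(Legion::Domain const &d) {
  return static_cast<int>(d.hi()[0] - d.lo()[0] + 1);
}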
AddBiasResidualLayerNorm::inference_kernel( m, @@ -260,4 +336,513 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] += f_grad_input; + dX_residual1_i[l] += f_grad_input; + if (dX_residual2 != nullptr) { + dX_residual2_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + 
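// The HIP file above launches the same kernels through hipLaunchKernelGGL;
// the mapping to the CUDA triple-chevron launches in the .cu counterpart is
// mechanical (my_kernel is an illustrative name):
//   CUDA: my_kernel<T><<<grid, block, shmem_bytes, stream>>>(args...);
//   HIP : hipLaunchKernelGGL(HIP_KERNEL_NAME(my_kernel<T>),
//                            grid, block, shmem_bytes, stream, args...);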
GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + 
checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 07f1f2af6b..097ace3676 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -22,6 +22,7 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, @@ -73,6 +74,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -189,14 +207,13 @@ void AddBiasResidualLayerNorm::inference_kernel( /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { cudaStream_t stream; @@ -208,6 +225,65 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = + added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -297,4 +373,482 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = 
threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] += f_grad_input; + dX_residual1_i[l] += f_grad_input; + if (dX_residual2 != nullptr) { + dX_residual2_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + 
added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 5fa18013e9..77ca372d2c 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -976,7 +976,7 @@ __host__ void case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = (AddBiasResidualLayerNormMeta *)metas->meta[op]; if (!m->elementwise_affine) { assert(fused->op_num_weights[op] == 1); // attn bias @@ -994,20 +994,14 @@ __host__ void beta = my_weight_accessor[2]; } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); AddBiasResidualLayerNorm::inference_kernel_wrapper( m, - attn_bias_dim, - residual_volume, + bc, my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], my_output_accessor[0], my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], gamma, beta); break; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 255136099a..383e171662 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -578,7 +578,7 @@ __host__ void case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = (AddBiasResidualLayerNormMeta *)metas->meta[op]; if (!m->elementwise_affine) { assert(fused->op_num_weights[op] == 1); // attn bias @@ -596,20 +596,14 @@ __host__ void beta = my_weight_accessor[2]; } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); AddBiasResidualLayerNorm::inference_kernel_wrapper( m, - attn_bias_dim, - residual_volume, + bc, my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], my_output_accessor[0], my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], gamma, beta); break; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ce82ec6702..6c1f4ef934 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -639,7 +639,7 @@ void ResidualLayerNorm::backward_task( runtime); GenericTensorAccessorW residual2_grad; if (m->use_two_residuals) { - GenericTensorAccessorW residual2_grad = + residual2_grad = helperGetGenericTensorAccessorRW(m->input_type[2], regions[region_idx++], task->regions[task_region_idx++], @@ -708,33 +708,33 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( machine_view_hash); int field_id = 0; // output_grad - launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - outputs[1]->region_grad)); + batch_outputs[1]->region)); launcher.add_field(field_id++, 
FID_DATA); // input grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[0]->region_grad)); + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual grad 1 - launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[1]->region_grad)); + batch_inputs[1]->region)); launcher.add_field(field_id++, FID_DATA); if (use_two_residuals) { // residual grad 2 - launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[2]->region_grad)); + batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } if (elementwise_affine) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8939e9e74d..500146b42c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5327,6 +5327,40 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + "AddBiasResidualLayerNorm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::backward_task>( + registrar, "AddBiasResidualLayerNorm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "AddBiasResidualLayerNorm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::peft_bwd_task>( + registrar, "AddBiasResidualLayerNorm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // SigmoidSiluMulti task { TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, From 60c0418301d9ccb935cc9f9807a4702170afd64a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 22 Oct 2023 14:50:24 +0000 Subject: [PATCH 036/198] implement IncMHA peft_bwd --- .../ops/inc_multihead_self_attention.h | 3 + src/ops/inc_multihead_self_attention.cu | 336 +++++++++++++++++- 2 files changed, 338 insertions(+), 1 deletion(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 76569de4cb..4fe79a1d87 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -218,6 +218,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // typedef hipFloatComplex attFloatComplex; hipFloatComplex *complex_input; #endif + // PEFT specific fields + void *softmax_activation_buffer; + void *query_activation_buffer; }; }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 0e3d90e02c..58831292ae 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -455,7 +455,341 @@ void 
peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *output_grad_ptr, DT const *bias_ptr, cudaStream_t stream) { - assert(false); + assert(!m->offload); + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; +#endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + // Step 1: compute gradients before final projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = k_; + int ldb = n_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [num_heads, vProjSize, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [num_new_tokens, oProjSize] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // matrix C: attn_heads gradients + // matrix C's layout: [num_new_tokens, num_heads, vProjSize] + DT *C = static_cast
(m->handle.workSpace); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [num_tokens, num_heads, vProjSize] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: qk_prods_softmax + // matrix B's layout: [num_heads, num_tokens, num_tokens] + DT const *B = static_cast
(m->qk_prods_softmax); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + + // vProjSize] + DT *C = + static_cast
(m->devQKVProjArray) + m->qProjSize + m->kProjSize; + int m_ = m->vProjSize; + int n_ = num_tokens; + int k_ = num_tokens; + int lda = m->vProjSize * m->num_q_heads; + int ldb = num_tokens; + int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int strideA = m->vProjSize; + int strideB = num_tokens * num_tokens; + int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + int m_ = num_tokens; + int n_ = num_tokens; + int k_ = m->vProjSize; + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; + // matrix A: value cache + // matrix A's layout: [num_req, max_num_tokens, num_heads, vProjSize] + DT const *A = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix B: attn_heads gradients + // matrix B's layout: [num_new_tokens, num_heads, vProjSize] + DT const *B = static_cast
(m->handle.workSpace); + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_heads, num_total_tokens, num_new_tokens] + DT *C = static_cast
(m->qk_prods_softmax); + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + // TODO: fill all elements above diagonal to force causal attention + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: query activation (in query_activation_buffer) + // matrix A's layout: [num_tokens, num_heads, m->qProjSize] + DT const *A = static_cast
(m->query_activation_buffer); + // matrix B: gradients w.r.t. qk_prods + // matrix B's layout: [num_heads, num_tokens, num_tokens] + DT const *B = static_cast
(m->qk_prods); + // matrix C: gradients w.r.t. key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + + // vProjSize] + DT *C = static_cast
(m->devQKVProjArray) + m->qProjSize; + int m_ = m->kProjSize; + int n_ = num_tokens; + int k_ = num_tokens; + int lda = m->num_q_heads * m->qProjSize; + int ldb = num_tokens; + int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int strideA = m->qProjSize; + int strideB = num_tokens * num_tokens; + int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: key cache + // matrix A's layout: [num_tokens, num_heads, m->kProjSize] + DT const *A = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix B: gradients w.r.t. qk_prods + // matrix B's layout: [num_heads, num_tokens, num_tokens] + DT const *B = static_cast
(m->qk_prods); + // matrix C: gradients w.r.t. query (saved as part of m->devQKVProjArray) + // matrix C's layout: + // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] + DT *C = static_cast
(m->devQKVProjArray); + int m_ = m->qProjSize; + int n_ = num_tokens; + int k_ = num_tokens; + int lda = m->kProjSize * m->num_q_heads; + int ldb = num_tokens; + int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int strideA = m->kProjSize; + int strideB = num_tokens * num_tokens; + int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 7: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: + // [(qProjSize + kProjSize + vProjSize) * num_q_heads, qSize] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: + // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [num_tokens, m->qSize] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = k_; + int ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } } } // namespace IncMultiHeadAttention From 509c54cec8bd7be1ffa99282f338afe3e9126b87 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 22 Oct 2023 17:31:01 -0400 Subject: [PATCH 037/198] several bug fixes --- src/ops/kernels/lora_linear_kernels.cu | 4 ++-- src/ops/sigmoid_silu_multi.cpp | 10 ++++++++-- src/ops/sigmoid_silu_multi.cu | 10 ++++++++-- src/runtime/model.cc | 3 ++- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index a3fc071f11..8ea2455cd0 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -263,9 +263,9 @@ void peft_bwd_kernel(LoraLinearMeta *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); - cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); - assert(weight_type == ff_to_cuda_datatype(m->weight_type[1])); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index ccd622ff17..0a9a814f5e 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -293,8 +293,14 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; } } - assert(num_peft_requests == 1); - assert(num_peft_tokens >= 1); + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 597f7ecdab..bb78973f70 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -283,8 +283,14 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; } } - assert(num_peft_requests == 1); - assert(num_peft_tokens >= 1); + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 500146b42c..4ccfe25a97 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1486,7 +1486,8 @@ OpMeta::OpMeta(FFHandler _handle) #endif OpMeta::OpMeta(FFHandler _handle, Op const *op) - : 
profiling(op->profiling), inference_debugging(op->inference_debugging) { + : handle(_handle), profiling(op->profiling), + inference_debugging(op->inference_debugging) { for (int i = 0; i < op->numInputs; i++) { trainable_inputs[i] = op->trainable_inputs[i]; reset_input_grads[i] = op->reset_input_grads[i]; From bc9f538ac4ae7c274e96de95f14af374d20d1896 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 01:04:06 +0000 Subject: [PATCH 038/198] [rms_norm] do not compute non-peft-bwd tokens in peft-bwd --- .../ops/kernels/residual_rms_norm_kernels.h | 1 + .../flexflow/ops/kernels/rms_norm_kernels.h | 1 + src/ops/fused.cu | 3 +- src/ops/kernels/lora_linear_kernels.cu | 24 ++--- src/ops/kernels/residual_rms_norm_kernels.cu | 90 +++++++++++-------- src/ops/kernels/rms_norm_kernels.cu | 54 +++++++---- src/ops/residual_layer_norm.cc | 1 + src/ops/residual_rms_norm.cc | 4 +- src/ops/rms_norm.cc | 4 +- 9 files changed, 107 insertions(+), 75 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 4fbe34f83f..3091f83675 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -67,6 +67,7 @@ void backward_kernel_wrapper( GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &residual_input0_grad, GenericTensorAccessorW const &residual_input1_grad, diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 72176f0383..92e5e04af3 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -62,6 +62,7 @@ void backward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &weight); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 383e171662..1f6614d341 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -994,19 +994,20 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_outputs[op] == 1); RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, my_output_grad_accessor[0], my_input_grad_accessor[0], my_weight_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { - // TODO: implement me assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, + bc, my_output_grad_accessor[0], my_input_grad_accessor[0], my_input_grad_accessor[1], diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 8ea2455cd0..c26803bcee 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -166,18 +166,16 @@ void inference_kernel(LoraLinearMeta *m, } // Assert that we have at most one request that requires peft_bwd assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if 
(bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -192,7 +190,7 @@ void inference_kernel(LoraLinearMeta *m, data_type_size(m->input_type[1]) * num_peft_tokens * rank); // copy input activation checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, @@ -215,7 +213,7 @@ void inference_kernel(LoraLinearMeta *m, weight.w0_ptr, weight_type, in_dim, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, input_type, in_dim, &beta, @@ -241,14 +239,12 @@ void inference_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - output_ptr + tokens_previous_requests * out_dim, + output_ptr + first_token_offset * out_dim, output_type, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } template @@ -274,22 +270,20 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t compute_type = CUDA_R_32F; #endif - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } // Skip PEFT forward-only requests if (!bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -307,7 +301,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr + first_token_offset * out_dim, output_type, out_dim, &alpha, @@ -329,7 +323,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, weight.w1_ptr, weight_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr + first_token_offset * out_dim, output_type, out_dim, &alpha, @@ -376,15 +370,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - input_grad_ptr + tokens_previous_requests * in_dim, + input_grad_ptr + first_token_offset * in_dim, input_type, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } } // namespace Internal diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 53804c0b1b..de84e50e29 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -274,18 +274,16 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, } assert(num_peft_requests <= 1); - 
int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -293,21 +291,19 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); // copy input activation if (m->input_type[0] == DT_FLOAT) { - checkCUDA(cudaMemcpyAsync(m->input_activation, - residual_output.get_float_ptr() + - tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * - num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); } else if (m->input_type[0] == DT_HALF) { - checkCUDA(cudaMemcpyAsync(m->input_activation, - residual_output.get_half_ptr() + - tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * - num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); } else { assert(false && "unsupport datatype in layernorm"); } @@ -437,33 +433,48 @@ void backward_kernel(ResidualRMSNormMeta const *m, template void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, T const *output_grad_ptr, T *residual_input0_grad_ptr, T *residual_input1_grad_ptr, T const *weight_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; - T const *residual_output_rms_input_ptr = - static_cast(m->input_activation); - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr)); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr); + const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; + const int64_t N = m->num_elements; + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + 
residual_input1_grad_ptr); + } } /* @@ -536,6 +547,7 @@ void backward_kernel_wrapper( regions[3](I): weight */ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &residual_input0_grad, GenericTensorAccessorW const &residual_input1_grad, @@ -554,6 +566,7 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, if (output_grad.data_type == DT_HALF) { peft_bwd_kernel(m, + bc, output_grad.get_half_ptr(), residual_input0_grad.get_half_ptr(), residual_input1_grad.get_half_ptr(), @@ -561,6 +574,7 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, stream); } else if (output_grad.data_type == DT_FLOAT) { peft_bwd_kernel(m, + bc, output_grad.get_float_ptr(), residual_input0_grad.get_float_ptr(), residual_input1_grad.get_float_ptr(), diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index ffb92613a5..8281506cbf 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -453,31 +453,47 @@ void backward_kernel_wrapper(RMSNormMeta const *m, template void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, T const *output_grad_ptr, T *input_grad_ptr, T const *weight_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr)); - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - static_cast(m->input_activation), - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr), - input_grad_ptr); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr)); + RMSNormBackwardCUDAKernel<<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr), + input_grad_ptr); + } } void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &weight) { @@ -494,12 +510,14 @@ void peft_bwd_kernel_wrapper(RMSNormMeta const *m, if (output_grad.data_type == DT_HALF) { peft_bwd_kernel(m, + bc, output_grad.get_half_ptr(), input_grad.get_half_ptr(), weight.get_half_ptr(), stream); } else if (output_grad.data_type == DT_FLOAT) { peft_bwd_kernel(m, + bc, output_grad.get_float_ptr(), input_grad.get_float_ptr(), weight.get_float_ptr(), diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 6c1f4ef934..754b6105fa 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -706,6 +706,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, diff --git a/src/ops/residual_rms_norm.cc 
b/src/ops/residual_rms_norm.cc index e2bc29635a..a6ed1dca9b 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -616,6 +616,7 @@ Legion::FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); // regions[0](I): RMS output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -660,6 +661,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, assert(task->regions.size() == 4); assert(regions.size() == 4); ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input0_grad = @@ -679,7 +681,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( - m, output_grad, residual_input0_grad, residual_input1_grad, weight); + m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); } Op *ResidualRMSNorm::materialize(FFModel &ff, diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 332472e8e4..3c1b4d2570 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -529,6 +529,7 @@ Legion::FutureMap false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -566,13 +567,14 @@ void RMSNorm::peft_bwd_task(Task const *task, assert(task->regions.size() == 3); assert(regions.size() == 3); RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); - peft_bwd_kernel_wrapper(m, output_grad, input_grad, weight); + peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); } void RMSNorm::serialize(Legion::Serializer &sez) const { From d8e92e9bfce26d897688260cfba1bb61cebb069a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 12:12:50 -0400 Subject: [PATCH 039/198] . 
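[Editor's sketch, not part of the patch] The register_new_peft_request() hunk below tokenizes each (prompt, completion) pair into the new per-request dataset, but as posted in this excerpt it appends the encoded prompt with push_back, prints an undefined `tokens` variable, and is missing a closing parenthesis in the make_pair call. The following self-contained sketch spells out the intended loop with the template arguments and the vector append written explicitly; the Tokenizer type and its Encode() signature are stand-ins, not the actual FlexFlow tokenizer API.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

using TokenId = int;

struct Tokenizer {
  // Stand-in: a real tokenizer returns subword ids for the input text.
  std::vector<TokenId> Encode(std::string const &text) const {
    return std::vector<TokenId>(text.size(), 0);
  }
};

// Tokenizes each (prompt, completion) pair, prepends BOS to the prompt, and
// skips samples whose combined length exceeds the sequence limit.
std::vector<std::pair<std::vector<TokenId>, std::vector<TokenId>>>
    tokenize_peft_dataset(
        std::vector<std::pair<std::string, std::string>> const &dataset,
        Tokenizer const &tokenizer,
        int bos_token_id,
        size_t max_sequence_length) {
  std::vector<std::pair<std::vector<TokenId>, std::vector<TokenId>>> samples;
  for (auto const &sample : dataset) {
    std::vector<TokenId> input_tokens;
    if (bos_token_id >= 0) {
      input_tokens.push_back(bos_token_id);
    }
    // Encode() returns a whole vector, so append it rather than push_back it.
    std::vector<TokenId> prompt_tokens = tokenizer.Encode(sample.first);
    input_tokens.insert(
        input_tokens.end(), prompt_tokens.begin(), prompt_tokens.end());
    std::vector<TokenId> output_tokens = tokenizer.Encode(sample.second);
    if (input_tokens.size() + output_tokens.size() > max_sequence_length) {
      std::printf("Warning: sample has %zu tokens, limit is %zu; skipping.\n",
                  input_tokens.size() + output_tokens.size(),
                  max_sequence_length);
      continue;
    }
    samples.emplace_back(std::move(input_tokens), std::move(output_tokens));
  }
  return samples;
}

A caller would feed the returned samples straight into the dataset field this commit adds to Request.
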
--- include/flexflow/request_manager.h | 8 ++- src/runtime/request_manager.cc | 80 +++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 637b9623f1..52da9a38ba 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -70,6 +70,11 @@ struct Request { std::vector beam_trees; }; +struct PEFTRequest : public Request { + std::vector < std::pair < std::vector, + std::vector dataset; +}; + // store the result of beam search struct BeamTree { struct treeLayer { @@ -227,7 +232,8 @@ class RequestManager { int bos_token_id; int eos_token_id; std::string output_filepath; - std::queue pending_request_queue; + std::queue pending_infr_request_queue; + std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c0573a50a3..603be8b00d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -212,7 +212,7 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); all_requests[request.guid] = request; if (verbose) { @@ -274,7 +274,71 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); + all_requests[request.guid] = request; + { + std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; + for (int i = 0; i < request.tokens.size(); i++) { + output = output + " " + std::to_string(request.tokens[i]); + } + log_req_mgr.print("%s", output.c_str()); + } + + GenerationResult gr; + gr.guid = request.guid; + gr.input_text = prompt; + gr.input_tokens = request.tokens; + gr.output_text = prompt; + gr.output_tokens = request.tokens; + request_generation_results[request.guid] = gr; + return request.guid; +} + +RequestManager::RequestGuid RequestManager::register_new_peft_request( + std::vector> const &dataset, + int max_sequence_length, + PEFTModelID peft_model_id) { + const std::lock_guard lock(request_queue_mutex); + // Add a new request + PEFTRequest request; + request.status = Request::PENDING; + request.guid = next_available_guid++; + request.max_sequence_length = max_sequence_length; + request.peft_model_id = peft_model_id; + for (auto const &sample : dataset) { + std::vector input_tokens; + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.push_back(bos_token_id); + } + input_tokens.push_back(this->tokenizer_->Encode(sample.first)); + std::vector output_tokens = + this->tokenizer_->Encode(sample.second); + if (input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens); + } + } + + // Currently don't support speculative inference for PEFT + assert(get_num_ssms() == 0); + if (get_num_ssms() == 0) { + std::cout << "No small speculative model registered, using incremental " + "decoding." 
+ << std::endl; + } else { + std::cout << "Num of models: " << get_num_ssms() << std::endl; + for (int i = 0; i < get_num_ssms(); i++) { + BeamTree beam_tree = BeamTree{}; + request.beam_trees.push_back(beam_tree); + } + } + + pending_infr_request_queue.push(request); all_requests[request.guid] = request; { std::string output = "New request tokens:"; @@ -368,10 +432,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, BatchConfig new_bc; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { // add new requests to the next batch - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; @@ -785,10 +849,10 @@ BeamSearchBatchConfig // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; From 0a512d25f05e9b98e275cc83f60544b7c60cd9f0 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 20:11:16 -0400 Subject: [PATCH 040/198] . 
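[Editor's sketch, not part of the patch] This commit adds a single-prompt FFModel::generate() overload and reworks the incr_decoding driver: the finetuning dataset is registered once through RequestManager::register_new_peft_request(), and inference prompts are then generated one at a time. Below is a compilable sketch of that driver flow with the FlexFlow classes replaced by minimal stand-ins; RequestManagerStub, FFModelStub, and the sample prompt are illustrative only, not the real API surface.

#include <string>
#include <utility>
#include <vector>

struct PEFTModelID { int id = 0; };
struct GenerationResult { std::string output_text; };

// Stand-in for RequestManager: only the call used below is modeled.
struct RequestManagerStub {
  void register_new_peft_request(
      std::vector<std::pair<std::string, std::string>> const &dataset,
      int max_sequence_length,
      PEFTModelID peft_model_id) {
    (void)dataset; (void)max_sequence_length; (void)peft_model_id;
  }
};

// Stand-in for FFModel: models the single-prompt generate() overload that
// this commit adds alongside the existing vector-of-prompts version.
struct FFModelStub {
  GenerationResult generate(std::string const &prompt, int max_seq_length) {
    (void)max_seq_length;
    return GenerationResult{prompt};
  }
};

int main() {
  RequestManagerStub rm;
  FFModelStub model;
  PEFTModelID peft_model_id;

  std::vector<std::string> prompts = {"The capital of France is"};

  // Self-supervised finetuning: each prompt doubles as its own target text,
  // so the dataset is a list of (input, output) string pairs.
  std::vector<std::pair<std::string, std::string>> dataset;
  for (auto const &text : prompts) {
    dataset.push_back(std::make_pair(text, text));
  }
  rm.register_new_peft_request(
      dataset, 256 /*max_sequence_length*/, peft_model_id);

  // Inference prompts now go through the per-prompt overload one at a time.
  for (auto const &prompt : prompts) {
    GenerationResult result =
        model.generate(prompt, 128 /*max_sequence_length*/);
    (void)result;
  }
  return 0;
}

Keeping finetuning registration separate from generate() matches the split introduced in the previous commit, which adds a dedicated pending_peft_request_queue alongside the renamed pending_infr_request_queue.
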
--- include/flexflow/model.h | 6 +- include/flexflow/request_manager.h | 14 ++-- inference/incr_decoding/incr_decoding.cc | 10 ++- src/ops/kernels/lora_linear_kernels.cu | 27 +++----- src/runtime/model.cu | 17 +---- src/runtime/request_manager.cc | 84 +++++++++++++++++++----- 6 files changed, 96 insertions(+), 62 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 5d986c1329..b4d2fe53af 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -830,7 +830,11 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::vector &prompts, + GenerationResult generate(std::string const &prompts, + int max_seq_length, + PEFTModelID peft_model_id = PEFTModelID::NO_ID); + + GenerationResult generate(std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id = PEFTModelID::NO_ID); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 52da9a38ba..f93fc4c080 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -66,13 +66,11 @@ struct Request { Status status = PENDING; std::vector tokens; - std::vector beam_trees; -}; - -struct PEFTRequest : public Request { - std::vector < std::pair < std::vector, - std::vector dataset; + // PEFT field + std::vector, + std::vector>> + dataset; }; // store the result of beam search @@ -118,11 +116,11 @@ class RequestManager { FFModel *get_model(int model_id); GenerationResult generate_incr_decoding(FFModel *model, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id); GenerationResult generate_spec_infer(FFModel *model, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id); GenerationResult get_generation_result(RequestGuid const &guid); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 461d71b23a..9f3a0a4a5f 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -272,14 +272,20 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); std::vector prompts; + std::vector> dataset; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); + dataset.push_back(std::make_pair(text, text)); } - GenerationResult result = - model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); + rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, peft_model_id); + for (auto &prompt : prompts) { + GenerationResult result = model.generate(prompt, 128 /*max_sequence_length*/); + } + //GenerationResult result = + // model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 8ea2455cd0..fd64c4710b 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -150,7 +150,7 @@ void inference_kernel(LoraLinearMeta *m, // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = input_type; + cudaDataType_t compute_type = output_type; #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ 
-166,18 +166,16 @@ void inference_kernel(LoraLinearMeta *m, } // Assert that we have at most one request that requires peft_bwd assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -192,7 +190,7 @@ void inference_kernel(LoraLinearMeta *m, data_type_size(m->input_type[1]) * num_peft_tokens * rank); // copy input activation checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, @@ -215,7 +213,7 @@ void inference_kernel(LoraLinearMeta *m, weight.w0_ptr, weight_type, in_dim, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, input_type, in_dim, &beta, @@ -241,14 +239,12 @@ void inference_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - output_ptr + tokens_previous_requests * out_dim, + output_ptr + first_token_offset * out_dim, output_type, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } template @@ -271,22 +267,19 @@ void peft_bwd_kernel(LoraLinearMeta *m, // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = output_type; #endif - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } // Skip PEFT forward-only requests if (!bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; @@ -307,7 +300,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr, output_type, out_dim, &alpha, @@ -329,7 +322,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, weight.w1_ptr, weight_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr, output_type, out_dim, &alpha, @@ -376,15 +369,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - input_grad_ptr + tokens_previous_requests * in_dim, + input_grad_ptr, input_type, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } } // namespace Internal diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 0c69c9a600..754a6b18d7 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -160,23 +160,10 @@ FFHandler 
.only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(info->peft_activation_reserve_space_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); Realm::RegionInstance workspaceInst; - Realm::RegionInstance::create_instance(workspaceInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); - handle.peft_activation_allocator->register_reserved_work_space( - ptr, info->peft_activation_reserve_space_size); + handle.peft_activation_allocator->create_legion_instance( + workspaceInst, info->peft_activation_reserve_space_size); } if (info->peft_weight_reserve_space_size > 0) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 603be8b00d..5631ea6523 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -301,26 +301,26 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( PEFTModelID peft_model_id) { const std::lock_guard lock(request_queue_mutex); // Add a new request - PEFTRequest request; + Request request; request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; request.peft_model_id = peft_model_id; for (auto const &sample : dataset) { std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - input_tokens.push_back(bos_token_id); + input_tokens.insert(input_tokens.begin(), bos_token_id); } - input_tokens.push_back(this->tokenizer_->Encode(sample.first)); std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { std::cout << "Warning: too many tokens in sample, only load up to " << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + << input_tokens.size() + output_tokens.size() << ".\n"; } else { - request.dataset.push_back(std::make_pair(input_tokens, output_tokens); + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); } } @@ -338,23 +338,29 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( } } - pending_infr_request_queue.push(request); + pending_peft_request_queue.push(request); all_requests[request.guid] = request; { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); + for (size_t r = 0; r < request.dataset.size(); r++) { + std::string input = "[" + std::to_string(r) + "] input:"; + std::string output = "[" + std::to_string(r) + "] output:"; + for (size_t i = 0; i < request.dataset[r].first.size(); i++) { + input = input + " " + std::to_string(request.dataset[r].first[i]); + } + for (size_t i = 0; i < request.dataset[r].second.size(); i++) { + output = output + " " + std::to_string(request.dataset[r].second[i]); + } + log_req_mgr.print("%s", input.c_str()); + log_req_mgr.print("%s", output.c_str()); } - log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; + //gr.input_text = prompt; 
+ //gr.input_tokens = request.tokens; + //gr.output_text = prompt; + //gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; return request.guid; } @@ -569,6 +575,40 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + // Step 3: add PEFT bwd requests + if (pending_peft_request_queue.size() > 0) { + Request &request = pending_peft_request_queue.front(); + assert(request.dataset.size() > 0); + int num_peft_tokens = request.dataset[0].first.size() + + request.dataset[0].second.size(); + if (num_peft_tokens + new_bc.num_active_tokens() <= get_max_tokens_per_batch()) { + // The last request slot is reserved for PEFT request + int peft_req_idx = get_max_requests_per_batch() - 1; + assert(new_bc.request_completed[peft_req_idx]); + new_bc.request_completed[peft_req_idx] = false; + new_bc.requestsInfo[peft_req_idx].first_token_depth_in_request = 0; + new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[peft_req_idx].num_tokens_in_batch = num_peft_tokens; + new_bc.requestsInfo[peft_req_idx].max_sequence_length = request.max_sequence_length; + new_bc.requestsInfo[peft_req_idx].request_guid = request.guid; + new_bc.requestsInfo[peft_req_idx].peft_model_id = request.peft_model_id; + new_bc.requestsInfo[peft_req_idx].peft_bwd = true; + for (size_t i = 0; i < request.dataset[0].first.size(); i++) { + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; + new_bc.num_tokens ++; + } + for (size_t i = 0; i < request.dataset[0].second.size(); i++) { + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].second[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + int depth = request.dataset[0].first.size() + i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + new_bc.num_tokens ++; + } + } + } + return new_bc; } @@ -1875,7 +1915,15 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::vector &prompts, +GenerationResult FFModel::generate(std::string const &prompt, + int max_seq_length, + PEFTModelID peft_model_id) { + std::vector prompts; + prompts.push_back(prompt); + return generate(prompts, max_seq_length, peft_model_id); +} + +GenerationResult FFModel::generate(std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id) { RequestManager *rm = RequestManager::get_request_manager(); @@ -1995,7 +2043,7 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, /*static*/ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); @@ -2056,7 +2104,7 @@ GenerationResult /*static*/ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); From 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 20:24:58 -0400 Subject: [PATCH 041/198] Update the default cublas behavior when CUDA_VERSION is not specified --- src/ops/inc_multihead_self_attention.cpp | 14 ++++++++------ src/ops/inc_multihead_self_attention.cu | 12 ++++++------ 
src/ops/kernels/linear_kernels.cpp | 18 ++++++++++-------- src/ops/kernels/linear_kernels.cu | 12 ++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 7 ++++--- src/ops/spec_inc_multihead_self_attention.cu | 6 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 7 ++++--- src/ops/tree_inc_multihead_self_attention.cu | 6 +++--- 8 files changed, 44 insertions(+), 38 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 37cc986f5e..d60386f927 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,10 +257,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) @@ -509,10 +510,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3b24a5a324..7080cbf05b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -508,11 +508,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best 
performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 231ca0f3d7..4354409f54 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -241,11 +241,12 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else - hipblasDatatype_t compute_type = input_type; + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = hipblas_data_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -337,11 +338,12 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = hipblas_data_type; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 8a93357dcf..d8a9b5aa16 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -311,11 +311,11 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = input_type; #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -401,11 +401,11 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = CUDA_R_32F; #endif int output_size = out_dim * batch_size; 
if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 1d81ae0c11..b1687d12a2 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,10 +200,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index ac74eb1c8f..681c7a0f72 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 1d9ebf67e0..26291fb3b4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,10 +157,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index edf7a2d075..758a93bbf7 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = 
ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From 464424ee2c5cf3f4f27dd5e368cbf7b6351a57d1 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 21:49:37 -0400 Subject: [PATCH 042/198] fix bugs in IncMHA peft_bwd kernel --- .../ops/inc_multihead_self_attention.h | 5 +- include/flexflow/request_manager.h | 9 +- inference/incr_decoding/incr_decoding.cc | 10 ++- src/ops/fused.cu | 4 +- src/ops/inc_multihead_self_attention.cu | 89 ++++++++++++++----- src/ops/kernels/linear_kernels.cpp | 9 +- src/ops/kernels/linear_kernels.cu | 6 +- src/ops/kernels/lora_linear_kernels.cu | 14 ++- src/runtime/request_manager.cc | 41 +++++---- 9 files changed, 121 insertions(+), 66 deletions(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 4fe79a1d87..8da8412c69 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -121,15 +121,14 @@ class IncMultiHeadSelfAttention : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, GenericTensorAccessorR const &bias); - static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f93fc4c080..a955eb0b9f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -115,10 +115,11 @@ class RequestManager { FFModel *get_model(int model_id); - GenerationResult generate_incr_decoding(FFModel *model, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id); + GenerationResult + generate_incr_decoding(FFModel *model, + std::vector const &prompts, + int max_seq_length, + PEFTModelID peft_model_id); GenerationResult generate_spec_infer(FFModel *model, std::vector const &prompts, int max_seq_length, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 9f3a0a4a5f..b74292ad9d 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -280,12 +280,14 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); dataset.push_back(std::make_pair(text, text)); } - rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, peft_model_id); + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); for (auto &prompt : prompts) { - GenerationResult result = model.generate(prompt, 128 /*max_sequence_length*/); + GenerationResult result = + 
model.generate(prompt, 128 /*max_sequence_length*/); } - //GenerationResult result = - // model.generate(prompts, 128 /*max_sequence_length*/); + // GenerationResult result = + // model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 383e171662..51bfb6a390 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -447,7 +447,7 @@ __host__ void case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = + IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); @@ -1016,7 +1016,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = + IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3fa41cfe6d..1a30799e1d 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -406,7 +406,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -461,11 +461,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -492,7 +492,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int n_ = num_tokens; int k_ = m->oProjSize; int lda = k_; - int ldb = n_; + int ldb = k_; int ldc = m_; float alpha = 1.0f, beta = 0.0f; // matrix A: output projection weight @@ -634,18 +634,18 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->qk_tensor, - m->softmax_activation_buffer, - m->qk_tensor, - m->qk_prods_softmax, - &beta, - m->qk_tensor, - m->qk_prods)); - // TODO: fill all elements above diagonal to force causal attention + // checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + // CUDNN_SOFTMAX_ACCURATE, + // CUDNN_SOFTMAX_MODE_CHANNEL, + // &alpha, + // m->qk_tensor, + // m->softmax_activation_buffer, + // m->qk_tensor, + // m->qk_prods_softmax, + // &beta, + // m->qk_tensor, + // m->qk_prods)); + // TODO: fill all elements above diagonal to force causal attention } // Step 5: compute gradients w.r.t. 
key { @@ -825,6 +825,24 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } } +template +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; + } +} + template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, @@ -843,7 +861,7 @@ __global__ void fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT *output_ptr, @@ -882,6 +900,23 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = allocator->allocate_instance_untyped( + sizeof(DT) * total_tokens * m->num_q_heads * m->qProjSize); + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } + // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) // a flag of using this scaling alpha @@ -995,6 +1030,20 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, &softmax_beta, m->qk_tensor, C_softmax)); + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = allocator->allocate_instance_untyped( + sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); + } + // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; m_ = m->vProjSize; @@ -1090,7 +1139,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1193,7 +1242,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( /*static*/ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 504380736f..2e8761472f 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -370,11 +370,12 @@ void peft_bwd_kernel(LinearMeta const *m, hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 525fdf4d11..4627179fc4 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -455,11 +455,11 @@ void peft_bwd_kernel(LinearMeta const *m, input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; output_grad_ptr = static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = output_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = CUDA_R_32F; #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index fd64c4710b..282d0efc7e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -145,12 +145,11 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t lr_actv_type = output_type; assert(input_type == output_type); cudaDataType_t weight_type = output_type; - -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = output_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = output_type; #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -263,13 +262,12 @@ void peft_bwd_kernel(LoraLinearMeta *m, assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = output_type; #endif - for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5631ea6523..4128fee220 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -357,10 +357,10 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( GenerationResult gr; gr.guid = request.guid; - //gr.input_text = prompt; - //gr.input_tokens = request.tokens; - //gr.output_text = prompt; - //gr.output_tokens = request.tokens; + // gr.input_text = prompt; + // gr.input_tokens = request.tokens; + // gr.output_text = prompt; + // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; return request.guid; } @@ -579,32 +579,37 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); assert(request.dataset.size() > 0); - int num_peft_tokens = request.dataset[0].first.size() - + request.dataset[0].second.size(); - if (num_peft_tokens + new_bc.num_active_tokens() <= get_max_tokens_per_batch()) { + int num_peft_tokens = + request.dataset[0].first.size() + request.dataset[0].second.size(); + if (num_peft_tokens + new_bc.num_active_tokens() <= + get_max_tokens_per_batch()) { // The last request slot is reserved for PEFT request int peft_req_idx = get_max_requests_per_batch() - 1; assert(new_bc.request_completed[peft_req_idx]); new_bc.request_completed[peft_req_idx] = false; new_bc.requestsInfo[peft_req_idx].first_token_depth_in_request = 0; - new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = 
+ new_bc.num_tokens; new_bc.requestsInfo[peft_req_idx].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[peft_req_idx].max_sequence_length = request.max_sequence_length; + new_bc.requestsInfo[peft_req_idx].max_sequence_length = + request.max_sequence_length; new_bc.requestsInfo[peft_req_idx].request_guid = request.guid; new_bc.requestsInfo[peft_req_idx].peft_model_id = request.peft_model_id; new_bc.requestsInfo[peft_req_idx].peft_bwd = true; for (size_t i = 0; i < request.dataset[0].first.size(); i++) { - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[0].first[i]; new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; - new_bc.num_tokens ++; + new_bc.num_tokens++; } for (size_t i = 0; i < request.dataset[0].second.size(); i++) { - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].second[i]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[0].second[i]; new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; int depth = request.dataset[0].first.size() + i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - new_bc.num_tokens ++; + new_bc.num_tokens++; } } } @@ -2041,11 +2046,11 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, } /*static*/ -GenerationResult - RequestManager::generate_incr_decoding(FFModel *llm, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { +GenerationResult RequestManager::generate_incr_decoding( + FFModel *llm, + std::vector const &prompts, + int max_seq_length, + PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; for (int i = 0; i < prompts.size(); i++) { From 45c1e0105a77299a54ed9cb812040869ca424a55 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 21:58:03 -0400 Subject: [PATCH 043/198] uncomment softmaxbackward --- src/ops/inc_multihead_self_attention.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 1a30799e1d..b83d23804c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -634,17 +634,17 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - // checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - // CUDNN_SOFTMAX_ACCURATE, - // CUDNN_SOFTMAX_MODE_CHANNEL, - // &alpha, - // m->qk_tensor, - // m->softmax_activation_buffer, - // m->qk_tensor, - // m->qk_prods_softmax, - // &beta, - // m->qk_tensor, - // m->qk_prods)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); // TODO: fill all elements above diagonal to force causal attention } // Step 5: compute gradients w.r.t. 
key From 07636e8f89ab470c2f9216be17d4ccfc444da9dc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 15:53:29 -0400 Subject: [PATCH 044/198] add layernorm to align test --- tests/align/test_all_operators.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 3fb361f25c..73b0cb30dc 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -11,7 +11,7 @@ function generate_torch_tensor(){ python tests/align/align_create_tensor_torch.py -o "$1" } -ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather) +ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) #create flexflow tensors conda activate flexflow From 28a5e84a68b2355478530e40484741ef6dbaab3e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 17:25:41 -0400 Subject: [PATCH 045/198] add peft test scripts --- tests/peft/hf_finetune.py | 120 ++++++++++++++++++++++++++++++++++++++ tests/peft/hf_serve.py | 51 ++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 tests/peft/hf_finetune.py create mode 100644 tests/peft/hf_serve.py diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py new file mode 100644 index 0000000000..981e4b0a1f --- /dev/null +++ b/tests/peft/hf_finetune.py @@ -0,0 +1,120 @@ +import os, sys +#os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn +#import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. 
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--output-dir", type=str, default="./finetuned-llama") + args = parser.parse_args() + model_name = args.model_name + use_full_precision=args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_dropout = args.lora_dropout + output_dir = args.output_dir + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + #load_in_8bit=True, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + device_map='auto', + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + for param in model.parameters(): + param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + #target_modules=["q_proj", "v_proj"], + target_modules=["down_proj"], + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM" + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data['train'], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join(output_dir, "logs"), + ), + data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) + ) + model.config.use_cache = False # silence the warnings. Please re-enable for inference! + trainer.train() + + print(f"Done fine-tuning! 
Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + # Upload to HF hub + #from huggingface_hub import notebook_login + #notebook_login() + #model.push_to_hub("goliaro/llama-7b-lora-half", use_auth_token=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py new file mode 100644 index 0000000000..677ccc6eeb --- /dev/null +++ b/tests/peft/hf_serve.py @@ -0,0 +1,51 @@ +import argparse +import torch +from peft import PeftModel, PeftConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") + parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--max-new-tokens", type=int, default=50) + args = parser.parse_args() + peft_model_id = args.peft_model_id + #peft_model_id = "goliaro/llama-7b-lora-half" + use_full_precision=args.use_full_precision + max_new_tokens = args.max_new_tokens + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + config = PeftConfig.from_pretrained(peft_model_id) + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + return_dict=True, + #load_in_8bit=True, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + device_map='auto', + ) + hf_config = AutoConfig.from_pretrained(config.base_model_name_or_path, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + config.base_model_name_or_path, use_fast=True, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.base_model_name_or_path, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + ) + + # Load the Lora model + model = PeftModel.from_pretrained(model, peft_model_id) + batch = tokenizer("Two things are infinite: ", return_tensors='pt') + with torch.cuda.amp.autocast(): + output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens) + print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) + +if __name__ == "__main__": + main() From dd9437063fcbcd65103429f8665cb03ccc00e83a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 17:25:57 -0400 Subject: [PATCH 046/198] fix import --- tests/peft/hf_serve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 677ccc6eeb..6f3753906f 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,5 +1,6 @@ import argparse import torch +import os, sys from peft import PeftModel, PeftConfig from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer From 3c013281adf4f98881a7d2d351b3e261c1599538 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 22:06:18 +0000 Subject: [PATCH 047/198] fix --- tests/peft/hf_finetune.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 981e4b0a1f..14aad1b9cc 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -62,6 +62,10 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True, 
torch_dtype = torch.float32 if use_full_precision else torch.float16,) else: tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later if param.ndim == 1: From fa56364a04c27bd86f83a1676e7f097bb27b4a79 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Oct 2023 15:47:14 +0000 Subject: [PATCH 048/198] add code to convert peft models --- conda/flexflow.yml | 6 + python/flexflow/serve/models/base.py | 9 +- python/flexflow/serve/models/falcon.py | 30 +-- python/flexflow/serve/models/llama.py | 51 +++--- python/flexflow/serve/models/mpt.py | 27 +-- python/flexflow/serve/models/opt.py | 45 ++--- python/flexflow/serve/models/starcoder.py | 14 +- python/flexflow/serve/serve.py | 213 ++++++++++++++++++---- requirements.txt | 7 + 9 files changed, 292 insertions(+), 110 deletions(-) diff --git a/conda/flexflow.yml b/conda/flexflow.yml index c9226269f2..3e39407bfa 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -25,3 +25,9 @@ dependencies: - sentencepiece - einops - requests + - scipy + - bitsandbytes + - datasets + - accelerate + - loralib + - peft diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 025008ec78..17bb894250 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -21,9 +21,9 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, - #max_tokens_per_batch=64, + # max_batch_size=1, + # max_seq_length=256, + # max_tokens_per_batch=64, weights_filepath="", tokenizer_filepath="", ): @@ -32,5 +32,8 @@ def __init__( def build_model(self): assert False, "Not implemented yet" + def convert_hf_weight_name(name): + assert False, "Not implemented yet" + def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 2b114f09b3..eafce814e1 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -19,8 +19,8 @@ class FalconConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.bias = hf_config.bias @@ -53,8 +53,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -62,11 +62,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - #self.falcon_config.max_seq_length = max_seq_length - #self.falcon_config.max_num_tokens = max_tokens_per_batch + # self.falcon_config.max_seq_length = max_seq_length + # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -235,6 +235,15 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + 
.replace("transformer_h_", "layers_") + .replace("transformer_", "") + .replace("self_attention_dense", "attention_wo") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -243,12 +252,7 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") - ) + name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights if "self_attention_query_key_value" in name: name_q = name.replace("self_attention_query_key_value", "attention_wq") diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 7ba0e78a37..ba5f1df7a2 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,8 @@ class LLAMAConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.num_hidden_layers = hf_config.num_hidden_layers @@ -45,8 +45,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,11 +54,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - #self.llama_config.max_seq_length = max_seq_length - #self.llama_config.max_num_tokens = max_tokens_per_batch + # self.llama_config.max_seq_length = max_seq_length + # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -242,24 +242,27 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 79a5bb940f..91d87669ca 100644 
--- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,8 @@ class MPTConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.hidden_size = hf_config.d_model @@ -40,8 +40,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -49,11 +49,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - #self.mpt_config.max_seq_length = max_seq_length - #self.mpt_config.max_num_tokens = max_tokens_per_batch + # self.mpt_config.max_seq_length = max_seq_length + # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -245,10 +245,18 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace(".", "_") + .replace("attn_out_proj", "attention_wo") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: name_q = name.replace("attn_Wqkv", "attention_wq") name_k = name.replace("attn_Wqkv", "attention_wk") @@ -265,9 +273,6 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "out_proj" in name: - name = name.replace("attn_out_proj", "attention_wo") - params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dfd1cde7d4..8250c63a9a 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -19,8 +19,8 @@ class OPTConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.do_layer_norm_before = hf_config.do_layer_norm_before @@ -46,8 +46,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -55,11 +55,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - #self.opt_config.max_seq_length = max_seq_length - #self.opt_config.max_num_tokens = max_tokens_per_batch + # self.opt_config.max_seq_length = max_seq_length + # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath 
= tokenizer_filepath self.maxint = 2**31 - 1 @@ -276,23 +276,26 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + .replace("decoder_", "") + .replace("model_", "") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("out_proj", "wo") + .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") + .replace( + "_final_layer_norm", "_add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" - ) # important to use the leading "_" to avoid matching the last LayerNorm - ) + name = FlexFlowOPT.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index f4f28a70e1..0f577299ed 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,8 @@ class STARCODERConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.dropout_p = hf_config.attn_pdrop @@ -44,8 +44,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -53,11 +53,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - #self.starcoder_config.max_seq_length = max_seq_length - #self.starcoder_config.max_num_tokens = max_tokens_per_batch + # self.starcoder_config.max_seq_length = max_seq_length + # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 549677d77a..1c9ece27ef 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -28,8 +28,9 @@ ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from peft import PeftModel, PeftConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib +import sys, torch, shutil, hashlib, json from typing import Union, List @@ -68,6 +69,36 @@ def __init__(self, text: str = None, tokens: list = None): self.output_tokens = tokens +class _SupportedModels: + def __init__( + self, + ): + self.supported_models = { + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + 
"OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), + } + + def get_ff_model_type(self, hf_config): + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + raise ValueError( + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + ) + return ff_arch + + class LLM: """This class creates a LLM (Large-Language Model) object based on a model from HuggingFace""" @@ -92,44 +123,20 @@ def __init__( :param output_file: Path to the output file. If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "GPTBigCodeForCausalLM": ( - ModelType.STARCODER, - FlexFlowSTARCODER, - STARCODERConfig, - ), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), - } + self.supported_models = _SupportedModels() self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path ( self.model_type, self.model_class, self.config_class, - ) = self.__get_ff_model_type() + ) = self.supported_models.get_ff_model_type(self.hf_config) self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file - def __get_ff_model_type(self): - architectures = getattr(self.hf_config, "architectures", []) - ff_arch = None - if next(iter(architectures), None) is not None: - ff_arch = self.supported_models.get(architectures[0]) - if ff_arch is None: - print( - f"Huggingface model of type {architectures} is not yet supported by FlexFlow" - ) - sys.exit(1) - return ff_arch - def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" self.config_dir = os.path.join( @@ -334,9 +341,9 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - #self.max_requests_per_batch = max_requests_per_batch - #self.max_seq_length = max_seq_length - #self.max_tokens_per_batch = max_tokens_per_batch + # self.max_requests_per_batch = max_requests_per_batch + # self.max_seq_length = max_seq_length + # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -376,7 +383,7 @@ def compile( self.ffconfig, self.hf_config, self.data_type, - max_tokens_per_batch + max_tokens_per_batch, ) # Create inference manager @@ -500,3 +507,147 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) + + +class PEFT: + """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" + + def __init__( + self, + peft_model_id: str, + data_type: DataType = DataType.DT_HALF, + cache_path: str = "", + refresh_cache: bool = False, + ): + self.hf_config = PeftConfig.from_pretrained(peft_model_id) + self.peft_model_id = peft_model_id + self.peft_type: self.hf_config.peft_type + if self.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {self.peft_type} not yet supported in FlexFlow" + ) + self.data_type = data_type + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT + self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" + self.refresh_cache = refresh_cache + # Base model related + self.supported_base_models = _SupportedModels() + if "base_model_name_or_path" not in self.hf_config.to_dict(): + raise ValueError( + f"PEFT model {peft_model_id} does not have an associated based model" + ) + self.base_model_hf_config = AutoConfig.from_pretrained( + self.hf_config.base_model_name_or_path, trust_remote_code=True + ) + ( + self.base_model_type, + self.base_model_class, + self.base_config_class, + ) = self.supported_base_models.get_ff_model_type(self.base_model_hf_config) + + def download_hf_config(self): + """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" + self.config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() + ) + self.config_path = os.path.join(self.config_dir, "config.json") + os.makedirs(self.config_dir, exist_ok=True) + print(f"Creating directory {self.config_dir} (if it doesn't exist)...") + print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") + with open(self.config_path, "w") as json_file: + json.dump(self.hf_config.to_dict(), json_file, indentation=2) + + def __get_revision_hashes(self, peft_model_id: str): + ff_revision = None + ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") + if os.path.exists(ff_revision_file): + ff_revision = "".join(open(ff_revision_file).read().split()) + + if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): + # Local model + files = os.listdir(peft_model_id) + state = files + [ + os.path.getmtime(os.path.join(peft_model_id, f)) for f in files + ] + latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() + else: + # Remote HuggingFace model + hf_api = HfApi() + latest_revision = hf_api.model_info(self.peft_model_id).sha + return ff_revision, ff_revision_file, latest_revision + + def convert_peft_model(self, hf_peft_model, weights_path): + for name, params in hf_peft_model.named_parameters(): + name = name.replace("base_model.model.model.", "").replace(".default", "") + name = self.base_model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_hf_weights_if_needed(self): + """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights. + """ + if self.data_type == DataType.DT_HALF: + torch.set_default_tensor_type(torch.HalfTensor) + elif self.data_type == DataType.DT_FLOAT: + torch.set_default_tensor_type(torch.FloatTensor) + else: + assert False, "Data type not yet supported -- cannot download weights!" + + # Use local cache, or download new version + self.weights_path = os.path.join( + os.path.expanduser(self.cache_path), + "weights", + self.peft_model_id.lower(), + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision", + ) + if self.refresh_cache: + print( + f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." + ) + if os.path.exists(self.weights_path): + shutil.rmtree(self.weights_path) + os.makedirs(self.weights_path, exist_ok=True) + print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.peft_model_id + ) + + # Download if needed + if ff_revision != latest_revision: + if not os.path.exists(self.peft_model_id) or os.path.isdir( + self.peft_model_id + ): + # Local model + print( + f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + ) + else: + # Remote model + print( + f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." 
+ ) + # Download model from HuggingFace, or load it from the local folder + hf_model = AutoModelForCausalLM.from_pretrained( + self.hf_config.base_model_name_or_path, + return_dict=True, + trust_remote_code=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + hf_peft_model = PeftModel.from_pretrained(hf_model, self.peft_model_id) + # Print log message to notify user download of model has finished + if not os.path.exists(self.peft_model_id) or os.path.isdir( + self.peft_model_id + ): + print("Done downloading HF weights. Converting them now...") + # Convert the model to FlexFlow format + self.__convert_peft_model(hf_peft_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print("Done converting the weights...") + else: + print(f"Loading '{self.peft_model_id}' model weights from the cache...") diff --git a/requirements.txt b/requirements.txt index 1037661337..43df6a2975 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,10 @@ onnx transformers>=4.31.0 sentencepiece einops +# peft-related +scipy +bitsandbytes +datasets +accelerate +loralib +peft From a4841008d0532c0d8f719339cef560134b1087a2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Oct 2023 16:15:47 +0000 Subject: [PATCH 049/198] add script to download peft for c++, fix bug --- inference/utils/download_peft_model.py | 59 ++++++++++++++++++++++++++ python/flexflow/serve/serve.py | 2 +- 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 inference/utils/download_peft_model.py diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py new file mode 100644 index 0000000000..1204634388 --- /dev/null +++ b/inference/utils/download_peft_model.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default="", + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + if args.full_precision_only: + data_types = ff.DataType.DT_FLOAT + elif args.half_precision_only: + data_types = ff.DataType.DT_HALF + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for peft_model_id in args.peft_model_ids: + for data_type in data_types: + peft = ff.PEFT( + peft_model_id, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + peft.download_hf_weights_if_needed() + peft.download_hf_config() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 1c9ece27ef..19f7f089b7 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -634,7 +634,7 @@ def 
download_hf_weights_if_needed(self): self.hf_config.base_model_name_or_path, return_dict=True, trust_remote_code=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if self.data_type == DataType.DT_FLOAT else torch.float16, device_map="auto", ) hf_peft_model = PeftModel.from_pretrained(hf_model, self.peft_model_id) From c83c376ccfc6fb6a322836ba56bb6da006b4323b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Oct 2023 17:05:37 +0000 Subject: [PATCH 050/198] fix --- python/flexflow/serve/__init__.py | 2 +- python/flexflow/serve/serve.py | 71 ++++++++++++++++--------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index cf467280bd..274b431ad8 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,7 +15,7 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import LLM, SSM, GenerationConfig, GenerationResult +from .serve import LLM, SSM, PEFT, GenerationConfig, GenerationResult def __check_positive_int(configs_dict: dict, key: str): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 19f7f089b7..e0e1b2e155 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -30,7 +30,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from peft import PeftModel, PeftConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib, json +import torch, shutil, hashlib, json, gc from typing import Union, List @@ -175,13 +175,6 @@ def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. """ - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" 
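The change above drops the process-global torch.set_default_tensor_type call in favor of passing an explicit torch_dtype when the HuggingFace model is instantiated. The sketch below shows that selection in isolation, with the FlexFlow DataType check collapsed to a boolean flag and an illustrative checkpoint name; neither is taken from the patch.

# Minimal sketch of the per-call dtype selection (illustrative only;
# "JackFram/llama-160m" is just an example checkpoint name).
import torch
from transformers import AutoModelForCausalLM

def load_hf_model(model_name: str, use_full_precision: bool):
    # Choose the dtype for this model only, instead of setting a
    # process-wide default tensor type.
    torch_dtype = torch.float32 if use_full_precision else torch.float16
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
    )

# model = load_hf_model("JackFram/llama-160m", use_full_precision=False)

Selecting the dtype per from_pretrained call keeps half-precision and full-precision downloads independent when both run in the same process, which a global default tensor type cannot guarantee.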
- # Use local cache, or download new version self.weights_path = os.path.join( os.path.expanduser(self.cache_path), @@ -218,7 +211,11 @@ def download_hf_weights_if_needed(self): ) # Download model from HuggingFace, or load it from the local folder hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, + trust_remote_code=True, + torch_dtype=torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16, ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): @@ -229,6 +226,10 @@ def download_hf_weights_if_needed(self): with open(ff_revision_file, "w+") as f: f.write(latest_revision) print("Done converting the weights...") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() else: print(f"Loading '{self.model_name}' model weights from the cache...") @@ -521,7 +522,7 @@ def __init__( ): self.hf_config = PeftConfig.from_pretrained(peft_model_id) self.peft_model_id = peft_model_id - self.peft_type: self.hf_config.peft_type + self.peft_type = self.hf_config.peft_type if self.peft_type != "LORA": raise RuntimeError( f"PEFT type {self.peft_type} not yet supported in FlexFlow" @@ -531,19 +532,13 @@ def __init__( self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache # Base model related - self.supported_base_models = _SupportedModels() if "base_model_name_or_path" not in self.hf_config.to_dict(): raise ValueError( f"PEFT model {peft_model_id} does not have an associated based model" ) - self.base_model_hf_config = AutoConfig.from_pretrained( - self.hf_config.base_model_name_or_path, trust_remote_code=True + self.base_model = LLM( + self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache ) - ( - self.base_model_type, - self.base_model_class, - self.base_config_class, - ) = self.supported_base_models.get_ff_model_type(self.base_model_hf_config) def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" @@ -555,7 +550,7 @@ def download_hf_config(self): print(f"Creating directory {self.config_dir} (if it doesn't exist)...") print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") with open(self.config_path, "w") as json_file: - json.dump(self.hf_config.to_dict(), json_file, indentation=2) + json.dump(self.hf_config.to_dict(), json_file, indent=2) def __get_revision_hashes(self, peft_model_id: str): ff_revision = None @@ -578,21 +573,17 @@ def __get_revision_hashes(self, peft_model_id: str): def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): - name = name.replace("base_model.model.model.", "").replace(".default", "") - name = self.base_model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + if self.peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.base_model.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. 
""" - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" - # Use local cache, or download new version self.weights_path = os.path.join( os.path.expanduser(self.cache_path), @@ -629,25 +620,35 @@ def download_hf_weights_if_needed(self): print( f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( + # Download base model from HuggingFace, or load it from the local folder + self.base_model.download_hf_weights_if_needed() + self.base_model.download_hf_tokenizer_if_needed() + self.base_model.download_hf_config() + hf_base_model = AutoModelForCausalLM.from_pretrained( self.hf_config.base_model_name_or_path, return_dict=True, trust_remote_code=True, - torch_dtype=torch.float32 if self.data_type == DataType.DT_FLOAT else torch.float16, - device_map="auto", + torch_dtype=torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16, + # device_map="auto", ) - hf_peft_model = PeftModel.from_pretrained(hf_model, self.peft_model_id) + hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) # Print log message to notify user download of model has finished if not os.path.exists(self.peft_model_id) or os.path.isdir( self.peft_model_id ): print("Done downloading HF weights. Converting them now...") # Convert the model to FlexFlow format - self.__convert_peft_model(hf_peft_model, self.weights_path) + self.convert_peft_model(hf_peft_model, self.weights_path) # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) print("Done converting the weights...") + # Deallocate hf model + del hf_peft_model + del hf_base_model + gc.collect() + torch.cuda.empty_cache() else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") From aa9f0046f74af4368c5d9a715549e906861f9b03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 27 Oct 2023 05:08:19 +0000 Subject: [PATCH 051/198] add script to fine-tune models --- .gitignore | 4 +++- tests/peft/fine_tune.sh | 19 +++++++++++++++++++ tests/peft/hf_finetune.py | 27 +++++++++++++++++---------- 3 files changed, 39 insertions(+), 11 deletions(-) create mode 100755 tests/peft/fine_tune.sh diff --git a/.gitignore b/.gitignore index 8fcc105f01..a032f80f77 100644 --- a/.gitignore +++ b/.gitignore @@ -186,4 +186,6 @@ gpt_tokenizer # pip version python/flexflow/version.txt -inference_tensors \ No newline at end of file +inference_tensors + +Untitled-1.ipynb \ No newline at end of file diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh new file mode 100755 index 0000000000..dbcdb849fa --- /dev/null +++ b/tests/peft/fine_tune.sh @@ -0,0 +1,19 @@ +#! 
/usr/bin/env bash +set -e +set -x + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full +python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half +python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full +python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half + +python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full +python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half + +python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full +python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half +python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full +python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 14aad1b9cc..d702d23038 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -32,16 +32,22 @@ def main(): parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf") parser.add_argument("--lora-rank", type=int, default=16) parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") parser.add_argument("--lora-dropout", type=float, default=0.05) parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") - parser.add_argument("--output-dir", type=str, default="./finetuned-llama") + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") args = parser.parse_args() model_name = args.model_name use_full_precision=args.use_full_precision lora_rank = args.lora_rank lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") lora_dropout = args.lora_dropout output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") # Change working dir to folder storing this script abspath = os.path.abspath(__file__) @@ -81,7 +87,8 @@ def main(): r=lora_rank, lora_alpha=lora_alpha, #target_modules=["q_proj", "v_proj"], - target_modules=["down_proj"], + #target_modules=["down_proj"], + target_modules=lora_target_modules, lora_dropout=lora_dropout, bias="none", task_type="CAUSAL_LM" @@ -105,20 +112,20 @@ def main(): learning_rate=2e-4, fp16=True if not use_full_precision else False, logging_steps=1, - 
output_dir=os.path.join(output_dir, "logs"), + output_dir=os.path.join(output_dir if len(output_dir) > 0 else "./", "lora_training_logs"), ), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) model.config.use_cache = False # silence the warnings. Please re-enable for inference! trainer.train() - print(f"Done fine-tuning! Saving the model to {output_dir}...") - model.save_pretrained(output_dir) - - # Upload to HF hub - #from huggingface_hub import notebook_login - #notebook_login() - #model.push_to_hub("goliaro/llama-7b-lora-half", use_auth_token=True) + if len(output_dir) > 0: + print(f"Done fine-tuning! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print(f"Done fine-tuning! Uploading the model to HF hub with id: {publish_peft_with_id}...") + model.push_to_hub(publish_peft_with_id, use_auth_token=True) if __name__ == "__main__": main() \ No newline at end of file From 4609e9e33aec98c34ba9bae71d8e9141e46e2a89 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 02:46:36 +0000 Subject: [PATCH 052/198] implement loading lora configs/weights from file --- include/flexflow/ops/lora_linear_params.h | 13 +++ inference/incr_decoding/incr_decoding.cc | 16 +++- inference/models/llama.cc | 7 +- src/ops/lora_linear.cc | 105 ++++++++++++++++++++-- src/ops/lora_linear_params.cc | 54 ++++++++++- src/ops/rms_norm.cc | 4 +- src/parallel_ops/combine.cc | 3 + 7 files changed, 188 insertions(+), 14 deletions(-) diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 46ee4ac6b7..e82243fd67 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/parallel_tensor.h" @@ -16,19 +17,31 @@ class LoraLinearConfig { LoraLinearConfig(int rank, OptimizerType type = OPTIMIZER_TYPE_SGD, float learning_rate = 1e-4); + LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_); friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); + friend std::ostream &operator<<(std::ostream &os, + LoraLinearConfig const &llc); public: int rank; OptimizerType optimizer_type; float learning_rate; + std::string cache_folder; + // Huggingface + std::string peft_model_id; + int lora_alpha; + float lora_dropout; + // whether to load weights from file, instead of initializing them randomly + bool load_weights_from_file; }; class LoraLinearParams { public: LayerID layer_guid; OperatorType type; + char name[MAX_OPNAME]; bool is_valid(std::pair const &input_shape) const; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index b74292ad9d..0017fe3fcb 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -39,6 +39,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, + std::string &peft_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -56,6 +57,13 @@ void parse_input_args(char **argv, } continue; } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = 
std::string(argv[++i]); @@ -124,7 +132,7 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - std::string llm_model_name; + std::string llm_model_name, peft_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -141,6 +149,7 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, + peft_model_name, use_full_precision, verbose, do_sample, @@ -258,7 +267,10 @@ void FlexFlow::top_level_task(Task const *task, } // Register PEFT layer - LoraLinearConfig mlp_second(4 /*rank*/); + LoraLinearConfig mlp_second = + peft_model_name.empty() + ? LoraLinearConfig::DefaultConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); PEFTModelID peft_model_id = model.register_peft_model( LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 72641161d1..9950d5b080 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,12 @@ void LLAMA::create_llama_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2, OP_LORA_MLP_SECOND); + ff.lora_linear( + multi, + w2, + OP_LORA_MLP_SECOND, + std::string("layers_" + std::to_string(i) + "_feed_forward_w2_lora") + .c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 2e356f7531..3d2d8d6106 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,11 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif namespace FlexFlow { @@ -86,7 +91,7 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, - name) {} + params.name) {} LoraLinear::LoraLinear(FFModel &model, LayerID const &_layer_guid, @@ -259,6 +264,34 @@ void LoraLinear::register_peft_model( fm.wait_all_results(); } +template +void load_peft_from_file(DT *ptr, + size_t size, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t target_data_size = sizeof(DT) * size; + in.seekg(shard_id * target_data_size, in.beg); + in.read((char *)host_array.data(), target_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != target_data_size) { + printf("load weight data error: %lu, %lu, %lu\n", + in_get_size, + target_data_size, + sizeof(DT)); + assert(false); + } + assert(size == host_array.size()); + copy_kernel(ptr, host_array.data(), target_data_size); + in.close(); +} + void LoraLinear::register_model_task(Task const *task, std::vector const ®ions, Context ctx, @@ -267,10 +300,16 @@ void LoraLinear::register_model_task(Task const *task, static_cast(task->args); LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); LoraLinear const *lora = info->lora; + + int shard_id = task->index_point.point_data[0]; + int rank = info->lora_config.rank; int num_dims = lora->inputs[0]->num_dims; int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); assert(dt == m->output_type[0]); @@ -278,29 +317,71 @@ void LoraLinear::register_model_task(Task const *task, assert(dt == lora->inputs[1]->data_type); assert(dt == lora->outputs[0]->data_type); assert(m->model_weights.find(info->model_id) == m->model_weights.end()); + LoraLinearWeight weight; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; weight.w0_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * in_dim * data_type_size(dt)); + info->model_id, w0_num_elements * data_type_size(dt)); weight.w1_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * out_dim * data_type_size(dt)); + info->model_id, w1_num_elements * data_type_size(dt)); + + // get layer name + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + // load weights from file + std::string weights_folder_filepath = join_path({ + info->lora_config.cache_folder, + "weights", + info->lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); + std::string w1_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); + if (dt == DT_FLOAT) { + load_peft_from_file( + (float *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + load_peft_from_file( + (float *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + } else if (dt == DT_HALF) { + load_peft_from_file( + (half *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + load_peft_from_file( + (half *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + } else { + assert(false && "Data type not supported"); + } + weight.rank = rank; if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { // Input is partitioned (no replication) // w0_grad is local weight gradients weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * in_dim * data_type_size(dt)); + info->model_id, w0_num_elements * data_type_size(dt)); // w1_grad is sync weight gradients weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, rank * out_dim * data_type_size(dt)); + info->model_id, w1_num_elements * data_type_size(dt)); } else { // Input is replicated // w0_grad is sync weight gradients weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, rank * in_dim * data_type_size(dt)); + info->model_id, w0_num_elements * data_type_size(dt)); // w1_grad is local weight gradients weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * out_dim * data_type_size(dt)); + info->model_id, w1_num_elements * data_type_size(dt)); } m->model_weights[info->model_id] = weight; } @@ -483,6 +564,8 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } /* static */ @@ -494,15 +577,20 @@ Node LoraLinear::deserialize(FFModel &ff, assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; + size_t name_len; + char name[MAX_OPNAME]; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); LoraLinearParams params; params.layer_guid = layer_guid; params.type = op_type; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -517,6 +605,9 @@ LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; params.type = this->op_type; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 80e7c6d64e..9d797aaed2 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -1,13 +1,50 @@ #include "flexflow/ops/lora_linear_params.h" +#include +#include +#include +using json = nlohmann::json; namespace FlexFlow { const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() - : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f) {} + : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), 
learning_rate(0.0f), + cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + load_weights_from_file(false) {} LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) - : rank(_rank), optimizer_type(_type), learning_rate(_lr) {} + : rank(_rank), optimizer_type(_type), learning_rate(_lr), cache_folder(""), + peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + load_weights_from_file(false) {} + +LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_) { + cache_folder = cache_folder_; + peft_model_id = peft_model_id_; + std::string peft_inference_config_file_path = + join_path({cache_folder, "configs", peft_model_id, "config.json"}); + std::ifstream config_file(peft_inference_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + rank = model_config["r"]; + lora_alpha = model_config["lora_alpha"]; + lora_dropout = model_config["lora_dropout"]; + } catch (json::exception const &e) { + std::cerr << "Error parsing PEFT config from JSON file: " << e.what() + << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << peft_inference_config_file_path + << std::endl; + assert(false); + } + optimizer_type = OPTIMIZER_TYPE_NONE; + learning_rate = 0.0f; + load_weights_from_file = true; +} bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && @@ -17,4 +54,17 @@ bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { return false; } +std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { + os << "LoraLinearConfig: "; + os << "rank: " << llc.rank << ", "; + os << "optimizer_type: " << llc.optimizer_type << ", "; + os << "learning_rate: " << llc.learning_rate << ", "; + os << "cache_folder: " << llc.cache_folder << ", "; + os << "peft_model_id: " << llc.peft_model_id << ", "; + os << "lora_alpha: " << llc.lora_alpha << ", "; + os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "load_weights_from_file: " << llc.load_weights_from_file << std::endl; + return os; +} + }; // namespace FlexFlow diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 3c1b4d2570..1a9bd7704e 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -543,14 +543,14 @@ Legion::FutureMap READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); // regions[2](I): weight launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 8411b42602..3433e2f21b 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -84,6 +84,9 @@ Combine::Combine(FFModel &model, dims[i] = _input->dims[i]; } assert(combine_degree > 0 && "Must use combine_degree > 0"); + std::cout << "combine_dim : " << combine_dim + << ", dims[combine_dim].degree: " << dims[combine_dim].degree + << ", combine_degree: " << combine_degree << std::endl; assert(dims[combine_dim].degree % combine_degree == 0); dims[combine_dim].degree /= combine_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); From 17fa6f3f514a0a6cdbf2179f6140bbe8b670ea3c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: 
Tue, 31 Oct 2023 03:39:13 +0000 Subject: [PATCH 053/198] remove peft_bwd assertion failure in embedding --- include/flexflow/ops/embedding.h | 5 +++++ src/ops/embedding.cc | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ae93ef4d1d..cd9ab4a775 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -60,6 +60,11 @@ class Embedding : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 007e799fe0..ea82a62071 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -609,6 +609,16 @@ void Embedding::backward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +Legion::FutureMap + Embedding::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // nothing to do (backward function only updates weights) + return FutureMap(); +} + void Embedding::backward_task(Task const *task, std::vector const ®ions, Context ctx, From cdc12e63014ccb644d1c6ebe7c6ffaf5582c7ceb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 15:52:21 -0400 Subject: [PATCH 054/198] fix download script --- inference/utils/download_hf_model.py | 4 ++-- inference/utils/download_peft_model.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 03fc8e1633..94a8c23e68 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -36,9 +36,9 @@ def parse_args(): def main(args): if args.full_precision_only: - data_types = ff.DataType.DT_FLOAT + data_types = (ff.DataType.DT_FLOAT,) elif args.half_precision_only: - data_types = ff.DataType.DT_HALF + data_types = (ff.DataType.DT_HALF,) else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 1204634388..5c7704b6f0 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -36,9 +36,9 @@ def parse_args(): def main(args): if args.full_precision_only: - data_types = ff.DataType.DT_FLOAT + data_types = (ff.DataType.DT_FLOAT,) elif args.half_precision_only: - data_types = ff.DataType.DT_HALF + data_types = (ff.DataType.DT_HALF,) else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) From eb9e2b84c0fc9a629c97e17432ff26d4e08a5203 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 16:54:11 -0400 Subject: [PATCH 055/198] add peft dependencies in dockerfile --- docker/flexflow-environment/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 0e9a3cda82..cae51f1446 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -95,6 +95,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 
sentencepiece einops RUN pip3 install tensorflow notebook +# PEFT-related +RUN pip3 install scipy bitsandbytes datasets accelerate loralib peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y From 3dfa14d5a9334a21c224c94785136cd04ffcc2b8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 21:51:20 +0000 Subject: [PATCH 056/198] fix softmax backward --- src/ops/kernels/softmax.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 9ccce40c58..96d50e1ca4 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -317,11 +317,12 @@ void peft_bwd_kernel(SoftmaxMeta const *m, GET_BLOCKS(num_bwd_tokens * num_classes), CUDA_NUM_THREADS, 0, - stream>>>(input_grad_ptr + tokens_previous_requests * num_classes, - output_grad_ptr + tokens_previous_requests * num_classes, - token_ids, - num_bwd_tokens, - num_classes); + stream>>>( + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); tokens_previous_requests += num_bwd_tokens; } From 78523e892cd928687c42861ac1ce20d424b9de03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 1 Nov 2023 04:04:14 +0000 Subject: [PATCH 057/198] fix bc print indentation --- src/runtime/batch_config.cc | 4 ++-- src/runtime/beam_search_batch_config.cc | 4 ++-- src/runtime/tree_verify_batch_config.cc | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 1a6e32e582..f5d69d1992 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -124,8 +124,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; - os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 82674cce69..bfcf30454c 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -134,8 +134,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; - os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index ea6e383453..f87500db74 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -55,8 +55,8 @@ 
std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; - os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; From bf78ea47c1477c68e359b55938e3b3c74015027d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 2 Nov 2023 19:42:04 +0000 Subject: [PATCH 058/198] Temporarily Revert "Update the default cublas behavior when CUDA_VERSION is not specified" This reverts commit 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a. --- src/ops/inc_multihead_self_attention.cpp | 14 ++++++-------- src/ops/inc_multihead_self_attention.cu | 12 ++++++------ src/ops/kernels/linear_kernels.cpp | 18 ++++++++---------- src/ops/kernels/linear_kernels.cu | 12 ++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 7 +++---- src/ops/spec_inc_multihead_self_attention.cu | 6 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 7 +++---- src/ops/tree_inc_multihead_self_attention.cu | 6 +++--- 8 files changed, 38 insertions(+), 44 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 4495f66844..8acdba7c25 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,11 +257,10 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) @@ -510,11 +509,10 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b83d23804c..7c881bf961 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT 
alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -873,11 +873,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 2e8761472f..e24f5fe58f 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -274,12 +274,11 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; + hipblasDatatype_t compute_type = input_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -440,12 +439,11 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; + hipblasDatatype_t compute_type = HIPBLAS_R_32F; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 4627179fc4..1897f11148 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -365,11 +365,11 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); 
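[Editor's note: illustrative sketch, not part of the patch series.] The hunks above and below keep toggling the same compute-type guard in several kernels, which is easy to lose in the diff noise. The snippet below restates the CUDA-path selection logic that the later "Fix cublas default (#1220)" patch in this series settles on. The helper name pick_cublas_compute_type is hypothetical; CUDA_VERSION, CUDA_R_32F, CUBLAS_COMPUTE_16F, and CUBLAS_COMPUTE_32F_FAST_16F are the macros and enums actually used in the diffs.

    #include <cuda.h>
    #include <cublas_v2.h>

    // Sketch: on CUDA < 11.0, cublasGemmEx takes a cudaDataType_t compute type,
    // so the kernels fall back to the tensor's own data type. On CUDA >= 11.0
    // they request CUBLAS_COMPUTE_16F for half-precision GEMMs and the
    // tensor-core-backed CUBLAS_COMPUTE_32F_FAST_16F for full precision.
    #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
    static cudaDataType_t pick_cublas_compute_type(cudaDataType_t data_type) {
      return data_type;
    }
    #else
    static cublasComputeType_t pick_cublas_compute_type(cudaDataType_t data_type) {
      return (data_type == CUDA_R_32F) ? CUBLAS_COMPUTE_32F_FAST_16F
                                       : CUBLAS_COMPUTE_16F;
    }
    #endif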
assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = input_type; #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -525,11 +525,11 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 6252693d1a..569dd7f1e5 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,11 +200,10 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index e986c4f34d..4338374dca 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 61117ce6df..e5bec2bc07 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,11 +157,10 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); 
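[Editor's note: illustrative sketch, not part of the patch series.] The .cpp hunks here are the HIP builds of the same attention and linear kernels. On that path CUDA_VERSION is normally not defined and, judging from the hipblasGemmEx calls in these files, the compute type is passed as a hipblasDatatype_t, so after this revert (and the follow-up fix) the HIP kernels simply compute in the output tensor's own datatype. A minimal restatement under those assumptions (pick_hipblas_compute_type is a hypothetical name):

    #include <hipblas.h>  // or <hipblas/hipblas.h>, depending on the ROCm version

    // Sketch: no CUBLAS_COMPUTE_32F_FAST_16F analogue is selected on the HIP
    // path; the compute type is just the hipBLAS datatype of the output.
    static hipblasDatatype_t pick_hipblas_compute_type(hipblasDatatype_t data_type) {
      return data_type;
    }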
miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 04dc39cfa0..14253e8f61 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From b9e7f60b9ca1658fcf608c97b341cc21485a400c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 2 Nov 2023 16:32:33 -0400 Subject: [PATCH 059/198] Fix cublas default (#1220) * Fix Legion prebuild workflow (2) (#1208) * fix * fix * fix * fix * Fix Legion prebuild workflow (3) (#1210) * fix hip error * use CUBLAS_COMPUTE_FAST_16F for full-precision gemm --------- Co-authored-by: Zhihao Jia --- .github/workflows/helpers/prebuild_legion.sh | 2 +- .github/workflows/prebuild-legion.yml | 6 +- CMakeLists.txt | 260 +++++++++---------- config/config.linux | 4 +- src/ops/inc_multihead_self_attention.cu | 26 +- src/ops/kernels/linear_kernels.cpp | 18 +- src/ops/kernels/linear_kernels.cu | 26 +- src/ops/spec_inc_multihead_self_attention.cu | 13 +- src/ops/tree_inc_multihead_self_attention.cu | 13 +- 9 files changed, 200 insertions(+), 168 deletions(-) diff --git a/.github/workflows/helpers/prebuild_legion.sh b/.github/workflows/helpers/prebuild_legion.sh index ccaa58383e..9f5cbe147a 100755 --- a/.github/workflows/helpers/prebuild_legion.sh +++ b/.github/workflows/helpers/prebuild_legion.sh @@ -13,7 +13,7 @@ else echo "Pre-building Legion with GPU backend: ${gpu_backend}" fi -if [[ "${gpu_backend}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then +if [[ "${gpu_backend}" == "cuda" || "${gpu_backend}" == "hip_cuda" ]]; then # Check that CUDA version is supported. Versions above 12.0 not supported because we don't publish docker images for it yet. 
if [[ "$gpu_backend_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0}" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 1cf0ea2dd8..267daaee6b 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -42,12 +42,12 @@ jobs: - name: Build Legion env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + gpu_backend: ${{ matrix.gpu_backend }} + gpu_backend_version: ${{ matrix.gpu_backend_version }} + python_version: ${{ matrix.python_version }} run: .github/workflows/helpers/prebuild_legion.sh - name: Archive compiled Legion library (CUDA) - env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} uses: actions/upload-artifact@v3 with: name: legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 648b46b49e..f9ce66a0f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,156 +504,156 @@ if(NOT BUILD_LEGION_ONLY) install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin") endif() endif() -endif() - -if (INFERENCE_TESTS) - target_link_libraries(flexflow "${TORCH_LIBRARIES}") - set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) -endif() - -# build binary -option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) -option(FF_BUILD_RESNET "build resnet example" OFF) -option(FF_BUILD_RESNEXT "build resnext example" OFF) -option(FF_BUILD_ALEXNET "build alexnet example" OFF) -option(FF_BUILD_DLRM "build DLRM example" OFF) -option(FF_BUILD_XDL "build XDL example" OFF) -option(FF_BUILD_INCEPTION "build inception example" OFF) -option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) -option(FF_BUILD_TRANSFORMER "build transformer example" OFF) -option(FF_BUILD_MOE "build mixture of experts example" OFF) -option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) -option(FF_BUILD_SPLIT_TEST "build split test example" OFF) -option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) -option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) -option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) -option(FF_BUILD_ALL_EXAMPLES "build all examples. 
Overrides others" OFF) -option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) -option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) -option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - -if(FF_BUILD_UNIT_TESTS) - set(BUILD_GMOCK OFF) - add_subdirectory(deps/googletest) - enable_testing() - add_subdirectory(tests/unit) -endif() - - if(FF_BUILD_SUBSTITUTION_TOOL) - add_subdirectory(tools/protobuf_to_json) + + if (INFERENCE_TESTS) + target_link_libraries(flexflow "${TORCH_LIBRARIES}") + set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - if(FF_BUILD_VISUALIZATION_TOOL) - add_subdirectory(tools/substitutions_to_dot) + # build binary + option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) + option(FF_BUILD_RESNET "build resnet example" OFF) + option(FF_BUILD_RESNEXT "build resnext example" OFF) + option(FF_BUILD_ALEXNET "build alexnet example" OFF) + option(FF_BUILD_DLRM "build DLRM example" OFF) + option(FF_BUILD_XDL "build XDL example" OFF) + option(FF_BUILD_INCEPTION "build inception example" OFF) + option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) + option(FF_BUILD_TRANSFORMER "build transformer example" OFF) + option(FF_BUILD_MOE "build mixture of experts example" OFF) + option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) + option(FF_BUILD_SPLIT_TEST "build split test example" OFF) + option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) + option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) + option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) + option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + + if(FF_BUILD_UNIT_TESTS) + set(BUILD_GMOCK OFF) + add_subdirectory(deps/googletest) + enable_testing() + add_subdirectory(tests/unit) endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) - if (FF_GPU_BACKEND STREQUAL "hip_rocm") - SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") - endif() - # Ensure Rust is installed - execute_process(COMMAND rustc --version - RESULT_VARIABLE RUST_COMMAND_RESULT - OUTPUT_VARIABLE RUSTC_OUTPUT - ERROR_QUIET) - if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + if(FF_BUILD_SUBSTITUTION_TOOL) + add_subdirectory(tools/protobuf_to_json) + endif() + + if(FF_BUILD_VISUALIZATION_TOOL) + add_subdirectory(tools/substitutions_to_dot) + endif() + + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if (FF_GPU_BACKEND STREQUAL "hip_rocm") + SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") + endif() + # Ensure Rust is installed + execute_process(COMMAND rustc --version + RESULT_VARIABLE RUST_COMMAND_RESULT + OUTPUT_VARIABLE RUSTC_OUTPUT + ERROR_QUIET) + if(NOT RUST_COMMAND_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is not installed on the system. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + # Ensure Cargo is installed + execute_process(COMMAND cargo --version + RESULT_VARIABLE CARGO_RESULT + OUTPUT_QUIET ERROR_QUIET) + if(NOT CARGO_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) + target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) + target_link_libraries(flexflow tokenizers_cpp) endif() - # Ensure Cargo is installed - execute_process(COMMAND cargo --version - RESULT_VARIABLE CARGO_RESULT - OUTPUT_QUIET ERROR_QUIET) - if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/ResNet) endif() - add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) - target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) - target_link_libraries(flexflow tokenizers_cpp) -endif() -if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/ResNet) -endif() -if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/resnext50) -endif() + if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/resnext50) + endif() -if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/AlexNet) -endif() + if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/AlexNet) + endif() -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/MLP_Unify) -endif() + if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/MLP_Unify) + endif() -if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test) -endif() + if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/split_test) + endif() -if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test_2) -endif() + if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/split_test_2) + endif() -if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/InceptionV3) -endif() + if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/InceptionV3) + endif() -#TODO: Once functional add to BUILD_ALL_EXAMPLES -if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/candle_uno) -endif() + #TODO: Once functional add to BUILD_ALL_EXAMPLES + if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/candle_uno) + endif() -if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/DLRM) + if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) - #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) + #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy 
src/runtime/dlrm_strategy.cc) - #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) -endif() + #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) + #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + endif() -if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/XDL) -endif() + if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/XDL) + endif() -if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/Transformer) -endif() + if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/Transformer) + endif() -if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/mixture_of_experts) -endif() + if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/mixture_of_experts) + endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(inference/spec_infer) - add_subdirectory(inference/incr_decoding) -endif() + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(inference/spec_infer) + add_subdirectory(inference/incr_decoding) + endif() -# installation -set(INCLUDE_DEST "include") -set(LIB_DEST "lib") -install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) -install(TARGETS flexflow DESTINATION ${LIB_DEST}) -# install python -if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT FF_BUILD_FROM_PYPI) - install( - DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ - DESTINATION ${PY_DEST}/flexflow - FILES_MATCHING - PATTERN "*.py") - else() - # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. - install( - PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py - DESTINATION ${PY_DEST}/flexflow/core - ) - # Use setup.py script to re-install the Python bindings library with the right library paths. - # Need to put the instructions in a subfolder because of issue below: - # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake - add_subdirectory(cmake/pip_install) + # installation + set(INCLUDE_DEST "include") + set(LIB_DEST "lib") + install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) + install(TARGETS flexflow DESTINATION ${LIB_DEST}) + # install python + if (FF_USE_PYTHON) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT FF_BUILD_FROM_PYPI) + install( + DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ + DESTINATION ${PY_DEST}/flexflow + FILES_MATCHING + PATTERN "*.py") + else() + # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. + install( + PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py + DESTINATION ${PY_DEST}/flexflow/core + ) + # Use setup.py script to re-install the Python bindings library with the right library paths. 
+ # Need to put the instructions in a subfolder because of issue below: + # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake + add_subdirectory(cmake/pip_install) + endif() endif() -endif() +endif() # if(NOT BUILD_LEGION_ONLY) diff --git a/config/config.linux b/config/config.linux index 5f15090a02..37b9bd16fd 100755 --- a/config/config.linux +++ b/config/config.linux @@ -10,7 +10,7 @@ #LD_FLAGS=${LD_FLAGS+=""} #set install dir -#INSTALL_DIR= +INSTALL_DIR=${INSTALL_DIR:-} # set build type BUILD_TYPE=${BUILD_TYPE:-Release} @@ -100,7 +100,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c881bf961..c406435327 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -873,11 +878,16 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudaDataType_t cublas_data_type = 
ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index e24f5fe58f..2e7ae68314 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -274,11 +274,12 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = output_type; #else - hipblasDatatype_t compute_type = input_type; + // TODO: currently use the output_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -439,11 +440,12 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = output_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use output_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 1897f11148..dad6dc4e00 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -365,11 +365,16 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; #else - cudaDataType_t compute_type = input_type; + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half 
precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -525,11 +530,16 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; #else - cudaDataType_t compute_type = CUDA_R_32F; + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 4338374dca..fb96862b81 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 14253e8f61..8c2ee24132 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,16 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if 
(m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From 463c75770a64c84c27660d60998b626cb88a4f9a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 3 Nov 2023 04:40:18 +0000 Subject: [PATCH 060/198] fix bugs, work on align opt-lora --- inference/incr_decoding/incr_decoding.cc | 20 ++++++++++---------- inference/models/opt.cc | 6 ++++++ src/ops/lora_linear.cc | 6 ++++-- src/runtime/batch_config.cc | 3 ++- src/runtime/beam_search_batch_config.cc | 3 ++- src/runtime/request_manager.cc | 6 ++++-- src/runtime/tree_verify_batch_config.cc | 3 ++- 7 files changed, 30 insertions(+), 17 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 0017fe3fcb..90d1902716 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -284,22 +284,22 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); std::vector prompts; - std::vector> dataset; + // std::vector> dataset; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); + // dataset.push_back(std::make_pair(text, text)); } - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - for (auto &prompt : prompts) { - GenerationResult result = - model.generate(prompt, 128 /*max_sequence_length*/); - } - // GenerationResult result = - // model.generate(prompts, 128 /*max_sequence_length*/); + // rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, + // peft_model_id); + // for (auto &prompt : prompts) { + // GenerationResult result = model.generate(prompt, 128 + // /*max_sequence_length*/); + // } + GenerationResult result = + model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } // Execution fence diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b29ae5410..9069aef9e1 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -214,6 +214,12 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + ff.lora_linear( + activation, + fc2, + OP_LORA_MLP_SECOND, + std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); } // final diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3d2d8d6106..3515a879c9 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -269,6 +269,8 @@ void load_peft_from_file(DT *ptr, size_t size, int shard_id, std::string filepath) { + std::cout << "Loading LORA weight " << filepath << ", size: " << size + << ", shard: " << shard_id << std::endl; std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { printf("Could not open file: %s\n", filepath.c_str()); @@ -443,10 +445,10 @@ void LoraLinear::inference_task(Task const *task, Runtime *runtime) { LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_active_peft_tokens() == 0) { + if (bc->num_active_tokens() == 0) { return; } - assert(regions.size() == 4); + assert(regions.size() == 2); assert(task->regions.size() == regions.size()); assert(m->input_type[0] == 
m->output_type[0]); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index f5d69d1992..22ab420674 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -124,7 +124,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index bfcf30454c..cab8528994 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -134,7 +134,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4128fee220..0b89010ab1 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -187,7 +187,6 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; request.peft_model_id = peft_model_id; - if (prompt.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " << get_max_sequence_length() << " tokens, but got " @@ -547,6 +546,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == @@ -2093,7 +2095,7 @@ GenerationResult RequestManager::generate_incr_decoding( BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); - im->peft_bwd(llm, 0, bcf); + // im->peft_bwd(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index f87500db74..5702bb0a56 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -55,7 +55,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT Model ID: " << 
bc.requestsInfo[i].peft_model_id + << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; From 7c65521e78c4106f80f4632a885d581efee3c8d5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 6 Nov 2023 14:58:20 +0000 Subject: [PATCH 061/198] update scripts --- tests/peft/fine_tune.sh | 4 +-- tests/peft/hf_finetune.py | 2 +- tests/peft/hf_serve.py | 55 +++++++++++++++++++++++++-------------- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh index dbcdb849fa..eddb6139d0 100755 --- a/tests/peft/fine_tune.sh +++ b/tests/peft/fine_tune.sh @@ -7,8 +7,8 @@ cd "${BASH_SOURCE[0]%/*}" python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half -python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full -python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half +python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full +python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index d702d23038..cf157a8913 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -29,7 +29,7 @@ def print_trainable_parameters(model): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf") + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") parser.add_argument("--lora-rank", type=int, default=16) parser.add_argument("--lora-alpha", type=int, default=32) parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 6f3753906f..efade301da 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -2,51 +2,68 @@ import torch import os, sys from peft import PeftModel, PeftConfig -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + def main(): parser = argparse.ArgumentParser() parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") - parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) parser.add_argument("--max-new-tokens", type=int, default=50) + 
parser.add_argument("--do-sample", action="store_true", help="Use sampling") args = parser.parse_args() peft_model_id = args.peft_model_id - #peft_model_id = "goliaro/llama-7b-lora-half" - use_full_precision=args.use_full_precision + use_full_precision = args.use_full_precision max_new_tokens = args.max_new_tokens # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) - + config = PeftConfig.from_pretrained(peft_model_id) model = AutoModelForCausalLM.from_pretrained( - config.base_model_name_or_path, - return_dict=True, - #load_in_8bit=True, - torch_dtype = torch.float32 if use_full_precision else torch.float16, - device_map='auto', + config.base_model_name_or_path, + return_dict=True, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + hf_config = AutoConfig.from_pretrained( + config.base_model_name_or_path, trust_remote_code=True ) - hf_config = AutoConfig.from_pretrained(config.base_model_name_or_path, trust_remote_code=True) hf_arch = getattr(hf_config, "architectures")[0] if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": tokenizer = LlamaTokenizer.from_pretrained( - config.base_model_name_or_path, use_fast=True, - torch_dtype = torch.float32 if use_full_precision else torch.float16, + config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, ) else: tokenizer = AutoTokenizer.from_pretrained( - config.base_model_name_or_path, - torch_dtype = torch.float32 if use_full_precision else torch.float16, + config.base_model_name_or_path, + torch_dtype=torch.float32 if use_full_precision else torch.float16, ) - + # Generation config + generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) + generation_config.do_sample = args.do_sample # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_id) - batch = tokenizer("Two things are infinite: ", return_tensors='pt') + batch = tokenizer("Two things are infinite: ", return_tensors="pt") with torch.cuda.amp.autocast(): - output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens) - print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) + output_tokens = model.generate( + **batch, max_new_tokens=max_new_tokens, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + if __name__ == "__main__": main() From f4b3f8f56efac7476c7703b4be9b70b5d5bc9857 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 6 Nov 2023 20:30:41 +0000 Subject: [PATCH 062/198] add code to output peft tensors in hf --- .gitignore | 1 + tests/peft/hf_serve.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a032f80f77..0579eb5a74 100644 --- a/.gitignore +++ b/.gitignore @@ -187,5 +187,6 @@ gpt_tokenizer python/flexflow/version.txt inference_tensors +hf_peft_tensors Untitled-1.ipynb \ No newline at end of file diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index efade301da..29baf5842b 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,6 +1,6 @@ import argparse import torch -import os, sys +import os, sys, shutil from peft import PeftModel, PeftConfig from transformers import ( AutoModelForCausalLM, @@ -10,6 +10,18 @@ GenerationConfig, ) +def peft_pre_forward_hook(module, input): + 
print("Pre-forward hook activated on module: ", module.name) + #print("Pre-Input: ", input) + torch.save(input, f"./hf_peft_tensors/{module.name}.input") + print("===") + +def peft_post_forward_hook(module, input, output): + print("Post-forward Hook activated for module: ", module.name) + #print("Post-Output: ", output) + torch.save(input, f"./hf_peft_tensors/{module.name}.output") + print("===") + def main(): parser = argparse.ArgumentParser() @@ -19,10 +31,12 @@ def main(): ) parser.add_argument("--max-new-tokens", type=int, default=50) parser.add_argument("--do-sample", action="store_true", help="Use sampling") + parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") args = parser.parse_args() peft_model_id = args.peft_model_id use_full_precision = args.use_full_precision max_new_tokens = args.max_new_tokens + save_peft_tensors = args.save_peft_tensors # Change working dir to folder storing this script abspath = os.path.abspath(__file__) @@ -57,6 +71,27 @@ def main(): generation_config.do_sample = args.do_sample # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_id) + + # Register hooks to save tensors, if needed + if save_peft_tensors: + shutil.rmtree("./hf_peft_tensors") + # Check that the output folder exists + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + print(params, type(params)) + torch.save(params, f"./hf_peft_tensors/{name}") + #params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # Save hidden states + for name, layer in dict(model.named_modules()).items(): + if "lora_A.default" in name or "lora_B.default" in name: + layer.name = name + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(peft_pre_forward_hook) + layer.register_forward_hook(peft_post_forward_hook) + + batch = tokenizer("Two things are infinite: ", return_tensors="pt") with torch.cuda.amp.autocast(): output_tokens = model.generate( From 9e5fea995d14c8b0c599cf21753ce534594021fa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 7 Nov 2023 04:17:55 +0000 Subject: [PATCH 063/198] update, fixes --- .../ops/kernels/lora_linear_kernels.h | 2 +- include/flexflow/utils/cuda_helper.h | 7 ++- include/flexflow/utils/hip_helper.h | 7 ++- inference/incr_decoding/incr_decoding.cc | 1 - src/ops/arg_topk.cc | 2 +- src/ops/argmax.cc | 8 ++-- src/ops/beam_topk.cc | 6 +-- src/ops/experts.cu | 16 +++---- src/ops/kernels/lora_linear_kernels.cu | 16 +++++-- src/ops/lora_linear.cc | 47 ++++++++++++++++++- src/ops/sampling.cc | 2 +- src/runtime/cuda_helper.cu | 44 +++++++++++------ src/runtime/hip_helper.cpp | 45 ++++++++++-------- tests/peft/hf_serve.py | 15 ++++-- 14 files changed, 151 insertions(+), 67 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 32a6832e2e..cf03e518fa 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -12,7 +12,7 @@ namespace FlexFlow { struct LoraLinearWeight { void *w0_ptr, *w1_ptr, *w0_grad_ptr, *w1_grad_ptr; void *w0_state_ptr, *w1_state_ptr; - int rank; + int in_dim, out_dim, rank; }; class LoraLinearMeta : public OpMeta { diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..983c20525e 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ 
-156,10 +156,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); + +template +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..b18567e1e7 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -141,10 +141,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); + +template +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7c4cef0973..1921e05323 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -158,7 +158,6 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); - assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 19b9bff1f6..b937b35b73 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -329,7 +329,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index daefaf3b98..e094abbf13 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -354,10 +354,10 @@ BeamInferenceResult ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); - download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); + copy_tensor_dev_to_host(parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -398,7 +398,7 @@ InferenceResult ArgMax::save_inference_tensors_to_file( m, shard_id, bc, {}, {}, {input, indices}); } - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 109937ee0b..d3166af392 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -393,9 +393,9 @@ BeamInferenceResult BeamInferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( + copy_tensor_dev_to_host(index_ptr, ir.token_ids, batch_size * 
m->max_beam_width); + copy_tensor_dev_to_host(value_ptr, ir.probs, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); if (m->inference_debugging) { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 6f0bd8afbb..614d755a35 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -579,14 +579,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #ifdef INFERENCE_TESTS // Checking // 1. check that m->sorted_indices contains indices sorted - int *indices_cpu = download_tensor(indices, num_indices); + int *indices_cpu = copy_tensor_dev_to_host(indices, num_indices); // assert(indices_cpu != nullptr); std::vector indices_vec(indices_cpu, indices_cpu + num_indices); std::vector indices_vec_sorted(indices_vec.size()); std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); - int *thrust_sorted_indices_cpu = download_tensor( + int *thrust_sorted_indices_cpu = copy_tensor_dev_to_host( m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_sorted_indices_cpu != nullptr); std::vector thrust_sorted_indices_vec( @@ -613,7 +613,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); } // 2. check that indices[m->original_indices[i]] = i - int *thrust_original_indices_cpu = download_tensor( + int *thrust_original_indices_cpu = copy_tensor_dev_to_host( m->original_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_original_indices_cpu != nullptr); std::vector thrust_original_indices_vec( @@ -669,7 +669,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index int *non_zero_expert_labels_cpu = - download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + copy_tensor_dev_to_host(m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -685,7 +685,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); int *exp_local_label_to_index = - download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + copy_tensor_dev_to_host(m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -700,7 +700,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // 8. 
Check expert_start_indexes int *expert_start_indices_thrust = - download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + copy_tensor_dev_to_host(m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -746,7 +746,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *num_assignments_per_expert_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); - assert(download_tensor(m->num_assignments_per_expert, + assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, num_assignments_per_expert_thrust, non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); @@ -759,7 +759,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *destination_start_indices_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); - assert(download_tensor(m->destination_start_indices, + assert(copy_tensor_dev_to_host(m->destination_start_indices, destination_start_indices_thrust, non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 50b6884a5b..2d271efe72 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -148,8 +148,13 @@ void inference_kernel(LoraLinearMeta *m, #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->input_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -263,10 +268,15 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; + cudaDataType_t compute_type = output_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3515a879c9..47d793446d 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,8 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#include +#include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" #else @@ -215,6 +217,7 @@ OpMeta *LoraLinear::init_task(Task const *task, LoraLinearMeta *m = new LoraLinearMeta(handle, lora); m->trainable_inputs[0] = lora->trainable_inputs[0]; std::strcpy(m->op_name, lora->name); + m->layer_guid = 
lora->layer_guid; return m; } @@ -290,7 +293,7 @@ void load_peft_from_file(DT *ptr, assert(false); } assert(size == host_array.size()); - copy_kernel(ptr, host_array.data(), target_data_size); + copy_tensor_host_to_dev(ptr, host_array.data(), size); in.close(); } @@ -321,6 +324,9 @@ void LoraLinear::register_model_task(Task const *task, assert(m->model_weights.find(info->model_id) == m->model_weights.end()); LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; weight.w0_ptr = allocator->allocate_local_weights_untyped( info->model_id, w0_num_elements * data_type_size(dt)); @@ -367,7 +373,6 @@ void LoraLinear::register_model_task(Task const *task, assert(false && "Data type not supported"); } - weight.rank = rank; if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { // Input is partitioned (no replication) // w0_grad is local weight gradients @@ -462,6 +467,44 @@ void LoraLinear::inference_task(Task const *task, // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + m->op_name + "_shard-id_" + std::to_string(shard_id); + std::cout << "base_filepath: " << base_filepath << std::endl; + std::cout << "m->decoding_step: " << m->decoding_step << std::endl; + if (m->decoding_step == 0) { + for (auto it = m->model_weights.begin(); it != m->model_weights.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_weights[peft_model_id]; + std::string filenameA = base_filepath + "_weight_A"; + std::string filenameB = base_filepath + "_weight_B"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); + save_tensor((float*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); + save_tensor((half*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + } + LoraLinear::save_inference_tensors_to_file(m, shard_id, bc, {input}, {}, {output}); + } } FutureMap LoraLinear::peft_bwd(FFModel const &ff, diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index e98c7f0ec3..4cec9a50b7 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -313,7 +313,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..74575ea6ba 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -375,7 +375,7 @@ 
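The hunks above rename download_tensor into copy_tensor_dev_to_host and introduce a host-to-device counterpart. A minimal caller-side sketch of the two helpers follows (illustrative only, not part of the patch; the buffer names are hypothetical and the declarations are assumed to come from flexflow/utils/cuda_helper.h):

#include "flexflow/utils/cuda_helper.h"
#include <cstdio>
#include <vector>

void dump_first_value(float const *d_buf, size_t n) {
  std::vector<float> h_buf(n);
  // Device -> host: the helper enqueues an async copy on the Legion stream
  // and no longer returns a bool, so synchronize before reading the host data.
  copy_tensor_dev_to_host<float>(d_buf, h_buf.data(), n);
  checkCUDA(cudaDeviceSynchronize());
  printf("value[0] = %.9f\n", h_buf[0]);
}

void upload_weights(float *d_buf, float const *h_buf, size_t n) {
  // Host -> device: the new direction added by this series
  // (used, e.g., by load_peft_from_file above).
  copy_tensor_host_to_dev<float>(d_buf, h_buf, n);
}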
__host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -388,14 +388,23 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { +__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(cudaMemcpyAsync( dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - return true; } + +template +__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(cudaMemcpyAsync( + dst, src, sizeof(T) * num_elements, cudaMemcpyHostToDevice, stream)); +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; @@ -700,26 +709,31 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, +template __host__ float *copy_tensor_dev_to_host(float const *ptr, size_t num_elements); -template __host__ half *download_tensor(half const *ptr, +template __host__ half *copy_tensor_dev_to_host(half const *ptr, size_t num_elements); -template __host__ double *download_tensor(double const *ptr, +template __host__ double *copy_tensor_dev_to_host(double const *ptr, size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, +template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, +template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool - download_tensor(half const *ptr, half *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, +template __host__ void + copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); +template __host__ void + copy_tensor_dev_to_host(half const *ptr, half *dst, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, double *dst, size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, int32_t *dst, size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, int64_t *dst, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); diff --git a/src/runtime/hip_helper.cpp 
b/src/runtime/hip_helper.cpp index fb94135c8f..ac0e7c157f 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -354,9 +354,7 @@ __host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -365,21 +363,25 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { hipHostMallocPortable | hipHostMallocMapped)); checkCUDA(hipMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); return host_ptr; } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(hipMemcpyAsync( dst, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); - return true; +} + +template +__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(hipMemcpyAsync( + dst, src, sizeof(T) * num_elements, hipMemcpyHostToDevice, stream)); } miopenStatus_t cudnnSetTensorDescriptorFromDomain( @@ -610,24 +612,29 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, +template __host__ float *copy_tensor_dev_to_host(float const *ptr, size_t num_elements); -template __host__ half *download_tensor(half const *ptr, +template __host__ half *copy_tensor_dev_to_host(half const *ptr, size_t num_elements); -template __host__ double *download_tensor(double const *ptr, +template __host__ double *copy_tensor_dev_to_host(double const *ptr, size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, +template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, +template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, +template __host__ void + copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, double *dst, size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, int32_t *dst, size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, int64_t *dst, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t 
num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); \ No newline at end of file diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 29baf5842b..0af515e6a9 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -11,16 +11,21 @@ ) def peft_pre_forward_hook(module, input): - print("Pre-forward hook activated on module: ", module.name) + assert(module.name is not None and module.decoding_step is not None) + name = module.name.replace("base_model.model.model", "") + print(f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}") #print("Pre-Input: ", input) - torch.save(input, f"./hf_peft_tensors/{module.name}.input") + torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input") print("===") def peft_post_forward_hook(module, input, output): - print("Post-forward Hook activated for module: ", module.name) + assert(module.name is not None and module.decoding_step is not None) + name = module.name.replace("base_model.model.model", "") + print(f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}") #print("Post-Output: ", output) - torch.save(input, f"./hf_peft_tensors/{module.name}.output") + torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") print("===") + module.decoding_step += 1 def main(): @@ -80,13 +85,13 @@ def main(): # Save weights for name, params in model.named_parameters(): if "lora" in name: - print(params, type(params)) torch.save(params, f"./hf_peft_tensors/{name}") #params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") # Save hidden states for name, layer in dict(model.named_modules()).items(): if "lora_A.default" in name or "lora_B.default" in name: layer.name = name + layer.decoding_step = 0 print(f"Adding hooks to layer {layer.name}") layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) From 62edfaa92a41093ee6d7cdbd6e2f2dd4b7799f38 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 7 Nov 2023 04:18:09 +0000 Subject: [PATCH 064/198] linting --- src/ops/argmax.cc | 3 +- src/ops/beam_topk.cc | 6 ++-- src/ops/experts.cu | 20 ++++++------ src/ops/lora_linear.cc | 22 ++++++++++---- src/runtime/cuda_helper.cu | 62 +++++++++++++++++++++++--------------- src/runtime/hip_helper.cpp | 57 +++++++++++++++++++++-------------- 6 files changed, 105 insertions(+), 65 deletions(-) diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index e094abbf13..cabb8b204f 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -357,7 +357,8 @@ BeamInferenceResult copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); - copy_tensor_dev_to_host(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host( + parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index d3166af392..a660a80301 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -393,8 +393,10 @@ BeamInferenceResult BeamInferenceResult ir; - copy_tensor_dev_to_host(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - copy_tensor_dev_to_host(value_ptr, ir.probs, batch_size * m->max_beam_width); + 
copy_tensor_dev_to_host( + index_ptr, ir.token_ids, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( + value_ptr, ir.probs, batch_size * m->max_beam_width); copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 614d755a35..f6f555d1ad 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -668,8 +668,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index - int *non_zero_expert_labels_cpu = - copy_tensor_dev_to_host(m->non_zero_expert_labels, non_zero_experts_count); + int *non_zero_expert_labels_cpu = copy_tensor_dev_to_host( + m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -684,8 +684,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, non_zero_experts_check_vec.end())); assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); - int *exp_local_label_to_index = - copy_tensor_dev_to_host(m->exp_local_label_to_index, non_zero_experts_count); + int *exp_local_label_to_index = copy_tensor_dev_to_host( + m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -699,8 +699,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } // 8. Check expert_start_indexes - int *expert_start_indices_thrust = - copy_tensor_dev_to_host(m->expert_start_indexes, non_zero_experts_count + 1); + int *expert_start_indices_thrust = copy_tensor_dev_to_host( + m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -747,8 +747,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, - num_assignments_per_expert_thrust, - non_zero_experts_count)); + num_assignments_per_expert_thrust, + non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); std::vector num_assignments_per_expert_thrust_vec( num_assignments_per_expert_thrust, @@ -760,8 +760,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); assert(copy_tensor_dev_to_host(m->destination_start_indices, - destination_start_indices_thrust, - non_zero_experts_count)); + destination_start_indices_thrust, + non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); std::vector destination_start_indices_thrust_vec( destination_start_indices_thrust, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 47d793446d..8115026f02 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -487,23 +487,33 @@ void LoraLinear::inference_task(Task const *task, std::cout << "base_filepath: " << base_filepath << std::endl; std::cout << "m->decoding_step: " << m->decoding_step << std::endl; if (m->decoding_step == 0) { - for (auto it = m->model_weights.begin(); it != m->model_weights.end(); ++it) { + for (auto it = m->model_weights.begin(); it != m->model_weights.end(); + ++it) { 
PEFTModelID peft_model_id = it->first; LoraLinearWeight weight = m->model_weights[peft_model_id]; std::string filenameA = base_filepath + "_weight_A"; std::string filenameB = base_filepath + "_weight_B"; if (m->input_type[0] == DT_FLOAT) { - save_tensor((float*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); - save_tensor((float*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { - save_tensor((half*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); - save_tensor((half*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); } else { assert(false && "Data type not supported"); } } } - LoraLinear::save_inference_tensors_to_file(m, shard_id, bc, {input}, {}, {output}); + LoraLinear::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); } } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 74575ea6ba..58d3dc8012 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -388,7 +388,8 @@ __host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { } template -__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); @@ -397,7 +398,8 @@ __host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) } template -__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(src != nullptr); @@ -710,30 +712,42 @@ template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); template __host__ float *copy_tensor_dev_to_host(float const *ptr, - size_t num_elements); + size_t num_elements); template __host__ half *copy_tensor_dev_to_host(half const *ptr, - size_t num_elements); + size_t num_elements); template __host__ double *copy_tensor_dev_to_host(double const *ptr, - size_t num_elements); -template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, - size_t num_elements); -template __host__ void - copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); -template __host__ void - copy_tensor_dev_to_host(half const *ptr, half *dst, size_t num_elements); + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(double const *ptr, - double *dst, - size_t num_elements); + double *dst, + size_t num_elements); template __host__ void 
copy_tensor_dev_to_host(int32_t const *ptr, - int32_t *dst, - size_t num_elements); + int32_t *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, - int64_t *dst, - size_t num_elements); -template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index ac0e7c157f..5ab86deaab 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -367,7 +367,8 @@ __host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { } template -__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); @@ -376,7 +377,8 @@ __host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) } template -__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(src != nullptr); @@ -613,28 +615,39 @@ template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); template __host__ float *copy_tensor_dev_to_host(float const *ptr, - size_t num_elements); + size_t num_elements); template __host__ half *copy_tensor_dev_to_host(half const *ptr, - size_t num_elements); + size_t num_elements); template __host__ double *copy_tensor_dev_to_host(double const *ptr, - size_t num_elements); -template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, - size_t num_elements); -template __host__ void - copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(double const *ptr, - double *dst, - size_t num_elements); + double *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, - int32_t *dst, - size_t num_elements); + int32_t *dst, + size_t num_elements); template 
__host__ void copy_tensor_dev_to_host(int64_t const *ptr, - int64_t *dst, - size_t num_elements); -template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); \ No newline at end of file + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); \ No newline at end of file From ddb5c2928608e5b489d666747b340872c7bd582e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 7 Nov 2023 23:32:36 +0000 Subject: [PATCH 065/198] fix printing of tensors for numpy --- src/runtime/cuda_helper.cu | 6 +++++- src/runtime/hip_helper.cpp | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 58d3dc8012..c2b2affc40 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -287,7 +287,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 5ab86deaab..0ffc1a895d 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -266,7 +266,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); From d276496f705e53c89f90c638ad4cc24cd03dcf53 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:16:47 +0000 Subject: [PATCH 066/198] update save_inference_tensors_to_file --- src/ops/lora_linear.cc | 53 ++++++++++++++++++++++++++++++++++++----- src/runtime/operator.cc | 3 ++- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 8115026f02..ffd5f6a958 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -467,6 +467,7 @@ void LoraLinear::inference_task(Task const *task, // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -478,17 +479,47 @@ void LoraLinear::inference_task(Task const *task, // Directory does not exist, 
create it mkdir(folder_path, 0700); } + + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // output base filepath, shared by all tensors from the same operator std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); - std::cout << "base_filepath: " << base_filepath << std::endl; - std::cout << "m->decoding_step: " << m->decoding_step << std::endl; + lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(base_filepath + "_batch-config"); + } + + std::string filename = base_filepath + "_input_" + std::to_string(0); + if (input.data_type == DT_FLOAT) { + save_tensor( + input.get_float_ptr(), input.domain.get_volume(), filename.c_str()); + } else if (input.data_type == DT_HALF) { + save_tensor( + input.get_half_ptr(), input.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + // std::cout << "base_filepath: " << base_filepath << std::endl; + // std::cout << "m->decoding_step: " << m->decoding_step << std::endl; if (m->decoding_step == 0) { for (auto it = m->model_weights.begin(); it != m->model_weights.end(); - ++it) { + ++it) { PEFTModelID peft_model_id = it->first; LoraLinearWeight weight = m->model_weights[peft_model_id]; std::string filenameA = base_filepath + "_weight_A"; @@ -512,8 +543,18 @@ void LoraLinear::inference_task(Task const *task, } } } - LoraLinear::save_inference_tensors_to_file( - m, shard_id, bc, {input}, {}, {output}); + + filename = base_filepath + "_output_" + std::to_string(0); + if (output.data_type == DT_FLOAT) { + save_tensor( + output.get_float_ptr(), output.domain.get_volume(), filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor( + output.get_half_ptr(), output.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + m->decoding_step++; } } diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 0b3813f41c..c60fa08814 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -26,8 +26,9 @@ size_t Op::get_params_hash() const { } /*static*/ +template void Op::save_inference_tensors_to_file( - OpMeta *m, + OpMetaType *m, int shard_id, BatchConfig const *bc, std::vector input_tensors, From bc79d3b536cfcb6de0ac5f6dbfacb10492b9d3de Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:17:16 +0000 Subject: [PATCH 067/198] linting --- src/ops/lora_linear.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index ffd5f6a958..4054173c2f 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -485,7 +485,7 @@ void LoraLinear::inference_task(Task const *task, size_t found = lora_layername.find(searchString); if (found == std::string::npos) { std::cout << "LoraLinear layer name not in the right format (does not " - "contain word 'lora')" + "contain word 'lora')" << std::endl; assert(false); } @@ -519,7 +519,7 @@ 
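All of the debugging dumps added in this patch share one file-naming convention. As a compact restatement (an illustrative sketch that mirrors the string built in the hunks above; the parameter names are hypothetical):

#include <string>

std::string debug_base_filepath(int model_id, int decoding_step, int layer_num,
                                std::string const &layer_name, int shard_id) {
  return "./inference_tensors/model_" + std::to_string(model_id) +
         "_decoding-step_" + std::to_string(decoding_step) + "_layer-num_" +
         std::to_string(layer_num) + "_layer-name_" + layer_name +
         "_shard-id_" + std::to_string(shard_id);
}

Individual tensors are then written under this prefix with suffixes such as _batch-config, _input_0, _weight_A, _weight_B, and _output_0.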
void LoraLinear::inference_task(Task const *task, // std::cout << "m->decoding_step: " << m->decoding_step << std::endl; if (m->decoding_step == 0) { for (auto it = m->model_weights.begin(); it != m->model_weights.end(); - ++it) { + ++it) { PEFTModelID peft_model_id = it->first; LoraLinearWeight weight = m->model_weights[peft_model_id]; std::string filenameA = base_filepath + "_weight_A"; From 8e34632c94924e0db444cc7a3bbe53fe0a38434d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:17:22 +0000 Subject: [PATCH 068/198] update --- tests/peft/hf_serve.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 0af515e6a9..6e143550c8 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -12,18 +12,18 @@ def peft_pre_forward_hook(module, input): assert(module.name is not None and module.decoding_step is not None) - name = module.name.replace("base_model.model.model", "") + name = module.name.replace("base_model.model.model.", "") print(f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}") - #print("Pre-Input: ", input) + print("Pre-Input: ", input[0].shape) torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input") - print("===") + #print("===") def peft_post_forward_hook(module, input, output): assert(module.name is not None and module.decoding_step is not None) - name = module.name.replace("base_model.model.model", "") + name = module.name.replace("base_model.model.model.", "") print(f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}") - #print("Post-Output: ", output) - torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save(output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") print("===") module.decoding_step += 1 From b11c5e9d81bfbc84073443ac69eb0376c1aad7c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:37:41 +0000 Subject: [PATCH 069/198] fix issue with save_inference_tensors_to_file --- include/flexflow/operator.h | 104 +++++++++++++++++++++++++++++++++- src/runtime/operator.cc | 110 ------------------------------------ 2 files changed, 102 insertions(+), 112 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index b827148a3a..df796a7879 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -9,6 +9,14 @@ #include "flexflow/utils/dot/record_formatter.h" #include +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + namespace FlexFlow { extern LegionRuntime::Logger::Category log_measure; @@ -234,13 +242,105 @@ class Op { assert(false); } virtual void print_layer(FFModel const &model) = 0; + template static void save_inference_tensors_to_file( - OpMeta *m, + OpMetaType *m, int shard_id, BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors); + std::vector output_tensors) { + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator 
+ std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + m->op_name + "_shard-id_" + std::to_string(shard_id); + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(base_filepath + "_batch-config"); + } + // save all inputs + for (int i = 0; i < input_tensors.size(); i++) { + std::string filename = base_filepath + "_input_" + std::to_string(i); + if (input_tensors[i].data_type == DT_FLOAT) { + save_tensor(input_tensors[i].get_float_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_HALF) { + save_tensor(input_tensors[i].get_half_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT32) { + save_tensor(input_tensors[i].get_int32_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT64) { + save_tensor(input_tensors[i].get_int64_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // only dump the weights once + if (m->decoding_step == 0) { + for (int i = 0; i < weight_tensors.size(); i++) { + std::string filename = base_filepath + "_weight_" + std::to_string(i); + if (weight_tensors[i].data_type == DT_FLOAT) { + save_tensor(weight_tensors[i].get_float_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_HALF) { + save_tensor(weight_tensors[i].get_half_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT32) { + save_tensor(weight_tensors[i].get_int32_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT64) { + save_tensor(weight_tensors[i].get_int64_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + } + // save all outputs + for (int i = 0; i < output_tensors.size(); i++) { + std::string filename = base_filepath + "_output_" + std::to_string(i); + if (output_tensors[i].data_type == DT_FLOAT) { + save_tensor(output_tensors[i].get_float_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_HALF) { + save_tensor(output_tensors[i].get_half_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT32) { + save_tensor(output_tensors[i].get_int32_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT64) { + save_tensor(output_tensors[i].get_int64_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // increase count of decoding steps + m->decoding_step++; + } virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const = 0; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index c60fa08814..08b1af8ca5 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -3,14 +3,6 @@ #include "flexflow/simulator.h" #include -#include -#include -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 
-#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif - namespace FlexFlow { size_t Op::get_untyped_params_hash() const { @@ -25,106 +17,4 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } -/*static*/ -template -void Op::save_inference_tensors_to_file( - OpMetaType *m, - int shard_id, - BatchConfig const *bc, - std::vector input_tensors, - std::vector weight_tensors, - std::vector output_tensors) { - - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); - // save batch config, if passed - if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); - } - // save all inputs - for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); - if (input_tensors[i].data_type == DT_FLOAT) { - save_tensor(input_tensors[i].get_float_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_HALF) { - save_tensor(input_tensors[i].get_half_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT32) { - save_tensor(input_tensors[i].get_int32_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT64) { - save_tensor(input_tensors[i].get_int64_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // only dump the weights once - if (m->decoding_step == 0) { - for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); - if (weight_tensors[i].data_type == DT_FLOAT) { - save_tensor(weight_tensors[i].get_float_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_HALF) { - save_tensor(weight_tensors[i].get_half_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT32) { - save_tensor(weight_tensors[i].get_int32_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT64) { - save_tensor(weight_tensors[i].get_int64_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - } - // save all outputs - for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); - if (output_tensors[i].data_type == DT_FLOAT) { - save_tensor(output_tensors[i].get_float_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_HALF) { - save_tensor(output_tensors[i].get_half_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT32) { - 
save_tensor(output_tensors[i].get_int32_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT64) { - save_tensor(output_tensors[i].get_int64_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // increase count of decoding steps - m->decoding_step++; -} - }; // namespace FlexFlow \ No newline at end of file From fca16ccf1b446fe89788f24ba0f36c6011891055 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 05:02:32 +0000 Subject: [PATCH 070/198] fix layer names for save_inference_tensors_to_file --- include/flexflow/operator.h | 12 +++++++- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/embedding_params.h | 1 + .../ops/inc_multihead_self_attention_params.h | 1 + include/flexflow/ops/linear_params.h | 1 + .../flexflow/ops/residual_layer_norm_params.h | 1 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm_params.h | 1 + .../flexflow/ops/sigmoid_silu_multi_params.h | 1 + ...spec_inc_multihead_self_attention_params.h | 1 + ...tree_inc_multihead_self_attention_params.h | 1 + inference/incr_decoding/incr_decoding.cc | 8 ++++-- src/ops/add_bias_residual_layer_norm.cc | 12 +++++++- src/ops/inc_multihead_self_attention.cc | 5 +++- src/ops/linear.cc | 12 +++++++- src/ops/lora_linear.cc | 2 +- src/ops/residual_layer_norm.cc | 12 +++++++- src/ops/residual_rms_norm.cc | 12 +++++++- src/ops/rms_norm.cc | 13 ++++++++- src/ops/sigmoid_silu_multi.cc | 12 +++++++- src/ops/spec_inc_multihead_self_attention.cc | 5 +++- src/ops/tree_inc_multihead_self_attention.cc | 5 +++- src/runtime/graph.cc | 28 +++++++++++++++++++ 23 files changed, 135 insertions(+), 13 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index df796a7879..388f9dcd6a 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -258,11 +258,21 @@ class Op { mkdir(folder_path, 0700); } // output base filepath, shared by all tensors from the same operator + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { bc->save_to_file(base_filepath + "_batch-config"); diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 6f49983467..87fe2fb562 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/embedding_params.h b/include/flexflow/ops/embedding_params.h index 71e5cc8b20..d813132048 
100644 --- a/include/flexflow/ops/embedding_params.h +++ b/include/flexflow/ops/embedding_params.h @@ -12,6 +12,7 @@ struct EmbeddingParams { LayerID layer_guid; AggrMode aggr; DataType data_type; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 7ae39f1cfe..58681069e2 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct IncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/linear_params.h b/include/flexflow/ops/linear_params.h index 563304e89f..9a62ebd857 100644 --- a/include/flexflow/ops/linear_params.h +++ b/include/flexflow/ops/linear_params.h @@ -20,6 +20,7 @@ class LinearParams { float kernel_reg_lambda; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input_shape) const; void solve_dims(const ParallelTensor input, diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 24da4a2c08..949ae0c799 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + char name[MAX_OPNAME]; bool is_valid(std::tuple const &) const; diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index 64751a30b0..a4e4de59ab 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; }; diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h index 81295322f0..2e4ceecf48 100644 --- a/include/flexflow/ops/rms_norm_params.h +++ b/include/flexflow/ops/rms_norm_params.h @@ -11,6 +11,7 @@ struct RMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h index c8182505b3..eb152db5c1 100644 --- a/include/flexflow/ops/sigmoid_silu_multi_params.h +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SigmoidSiluMultiParams { LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 2f7a706bf1..4d1d78b1dd 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -13,6 +13,7 @@ struct SpecIncMultiHeadSelfAttentionParams { float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git 
a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 14fcde74ba..d1a51b8b8f 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct TreeIncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 1921e05323..7ec574edf1 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -263,8 +263,12 @@ void FlexFlow::top_level_task(Task const *task, peft_model_name.empty() ? LoraLinearConfig::DefaultConfig : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); + PEFTModelID peft_model_id = + peft_model_name.empty() + ? PEFTModelID::NO_ID + : model.register_peft_model( + LoraLinearConfig::DefaultConfig /*mlp_first*/, + mlp_second /*mlp_second*/); int total_num_requests = 0; { diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 5d19dffdbc..ed682e81fc 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -58,6 +58,9 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -213,7 +216,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( FFModel &model, @@ -1027,6 +1030,8 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1055,6 +1060,10 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); AddBiasResidualLayerNormParams params; params.layer_guid = layer_guid; @@ -1062,6 +1071,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 5cf4dbdf7c..5e079bfb7f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -567,7 +567,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -1055,6 +1055,9 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { 
params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index f8181570ce..2c8afb6eab 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,7 +190,7 @@ Linear::Linear(FFModel &model, params.quantization_type, params.offload, allocate_weights, - name) {} + params.name) {} Linear::Linear(FFModel &model, LayerID const &_layer_guid, @@ -1354,6 +1354,8 @@ void Linear::serialize(Legion::Serializer &sez) const { sez.serialize(this->data_type); sez.serialize(this->quantization_type); sez.serialize(this->offload); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } /* static */ @@ -1384,6 +1386,10 @@ Node Linear::deserialize(FFModel &ff, dez.deserialize(data_type); dez.deserialize(quantization_type); dez.deserialize(offload); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LinearParams params; params.activation = activation; @@ -1395,6 +1401,7 @@ Node Linear::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.quantization_type = quantization_type; params.offload = offload; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -1409,6 +1416,9 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 4054173c2f..bcdf61b54e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -674,7 +674,7 @@ Node LoraLinear::deserialize(FFModel &ff, size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; size_t name_len; - char name[MAX_OPNAME]; + char name[MAX_OPNAME] = {0}; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 754b6105fa..1bfd52d107 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -63,6 +63,9 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -228,7 +231,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} ResidualLayerNorm::ResidualLayerNorm(FFModel &model, LayerID const &_layer_guid, @@ -1069,6 +1072,8 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1098,6 +1103,10 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); if (use_two_residuals) { assert(num_inputs == 3); } else { @@ -1111,6 +1120,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = 
use_bias; params.use_two_residuals = use_two_residuals; + strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( {inputs[0], inputs[1], inputs[2]}, params); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index a6ed1dca9b..1f05c9bf4d 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -55,6 +55,9 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -141,7 +144,7 @@ ResidualRMSNorm::ResidualRMSNorm( params.eps, params.dim, allocate_weights, - name) {} + params.name) {} ResidualRMSNorm::ResidualRMSNorm( FFModel &model, @@ -460,6 +463,8 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -478,10 +483,15 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ResidualRMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 1a9bd7704e..0d7cc3b7af 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,6 +53,9 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -583,6 +586,8 @@ void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -602,10 +607,16 @@ Node RMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); RMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + strcpy(params.name, name); + return ff.get_or_create_node(inputs[0], params); } @@ -613,7 +624,7 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], true, this->name); + return new RMSNorm(ff, params, inputs[0], true, params.name); } bool RMSNorm::measure_operator_cost(Simulator *sim, diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index e36eb36d31..b3771ea267 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,6 +52,9 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -110,7 +113,7 @@ SigmoidSiluMulti::SigmoidSiluMulti( std::pair const &inputs, char const 
*name) : SigmoidSiluMulti( - model, params.layer_guid, inputs.first, inputs.second, name) {} + model, params.layer_guid, inputs.first, inputs.second, params.name) {} SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, LayerID const &_layer_guid, @@ -532,6 +535,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -546,9 +551,14 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index eb6fd721e6..d4b74f20ae 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -511,7 +511,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.qk_prod_scaling, params.position_bias, allocate_weights, - name) {} + params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -853,6 +853,9 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d5a8a1063d..d0efb01d54 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -562,7 +562,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void TreeIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -927,6 +927,9 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index b58990d32e..cc626c1b42 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2298,6 +2298,8 @@ GraphOptimalViewSerialized sez.serialize(embed->out_channels); sez.serialize(embed->aggr); sez.serialize(embed->data_type); + sez.serialize(strlen(embed->name)); + sez.serialize(embed->name, strlen(embed->name)); break; } case OP_MULTIHEAD_ATTENTION: { @@ -2337,6 +2339,8 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2359,6 +2363,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->num_kv_heads); + sez.serialize(strlen(attn->name)); + 
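For reference, the operators changed in the hunks above and below all follow the same name round trip: the operator name is serialized as a length-prefixed byte string and restored into a zero-initialized buffer that is then copied into params.name. A condensed sketch of that pattern, assuming Legion's Serializer/Deserializer byte-copy semantics and FlexFlow's MAX_OPNAME buffer bound (both taken from the surrounding hunks; the deserialized name_len is assumed to fit within MAX_OPNAME, which these hunks do not check):

  // serialize: length first, then the raw bytes (no NUL terminator is written)
  sez.serialize(strlen(this->name));
  sez.serialize(this->name, strlen(this->name));

  // deserialize: the zero-initialized buffer keeps the copied name NUL-terminated
  size_t name_len;
  char name[MAX_OPNAME] = {0};
  dez.deserialize(name_len);
  dez.deserialize(name, name_len);
  strcpy(params.name, name);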
sez.serialize(attn->name, strlen(attn->name)); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2384,6 +2390,8 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_SOFTMAX: { @@ -2656,6 +2664,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(out_channels); dez.deserialize(aggr); dez.deserialize(data_type); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); EmbeddingParams params; params.aggr = aggr; @@ -2663,6 +2675,7 @@ void FFModel::deserialize_graph_optimal_view( params.out_channels = out_channels; params.layer_guid = layer_guid; params.data_type = data_type; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2798,6 +2811,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2818,6 +2835,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2846,6 +2864,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(num_kv_heads); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2863,6 +2885,7 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.num_kv_heads = num_kv_heads; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; @@ -2897,6 +2920,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2917,6 +2944,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; From 9095f2b5ab2c4528581afd5bb8c0371284f5b78f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 9 Nov 2023 20:39:20 +0000 Subject: [PATCH 071/198] fix peft --- include/flexflow/batch_config.h | 2 +- src/ops/add_bias_residual_layer_norm.cc | 5 +++++ src/ops/fused.cu | 4 ++-- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/layer_norm.cc | 5 +++++ src/ops/linear.cc | 2 +- src/ops/lora_linear.cc | 2 +- src/ops/residual_layer_norm.cc | 4 ++++ src/ops/residual_rms_norm.cc | 3 +++ src/ops/rms_norm.cc | 3 +++ src/ops/sigmoid_silu_multi.cc | 2 +- src/ops/softmax.cc | 2 +- src/parallel_ops/allreduce.cc | 4 +++- src/runtime/batch_config.cc | 2 +- src/runtime/request_manager.cc | 8 +++++--- 15 files changed, 37 
insertions(+), 13 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 60ca550898..a592674b6e 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { static int const MAX_NUM_TOKENS = 1024; // Set by update - int num_tokens; + int num_tokens, num_peft_tokens; struct PerRequestInfo { PerRequestInfo() { diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index ed682e81fc..2ce2056050 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -917,6 +917,7 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, @@ -963,6 +964,10 @@ void AddBiasResidualLayerNorm::peft_bwd_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } assert(task->regions.size() == regions.size()); AddBiasResidualLayerNormMeta const *m = *((AddBiasResidualLayerNormMeta **)task->local_args); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index e44b9df951..1cb17ec20e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -699,8 +699,8 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, FusedOp const *fused = metas->fused_op; // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // Return if no active tokens - if (bc->num_active_tokens() == 0) { + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { return; } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 5e079bfb7f..d2c1209ade 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -948,7 +948,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, bc->num_active_requests()); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 0a467f0984..ba2d43022f 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -669,6 +669,7 @@ Legion::FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -704,6 +705,10 @@ void LayerNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 2c8afb6eab..86f958a433 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -736,7 +736,7 @@ void Linear::peft_bwd_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } assert(regions.size() == (3 + 
static_cast(m->use_bias))); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index bcdf61b54e..5870243ade 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -609,7 +609,7 @@ void LoraLinear::peft_bwd_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } assert(regions.size() == 6); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 1bfd52d107..e3b599d10f 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -758,6 +758,10 @@ void ResidualLayerNorm::peft_bwd_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } assert(task->regions.size() == regions.size()); ResidualLayerNormMeta const *m = *((ResidualLayerNormMeta **)task->local_args); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 1f05c9bf4d..8013c0e81a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -672,6 +672,9 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, assert(regions.size() == 4); ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input0_grad = diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 0d7cc3b7af..fe6944aa90 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -571,6 +571,9 @@ void RMSNorm::peft_bwd_task(Task const *task, assert(regions.size() == 3); RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index b3771ea267..14c202f784 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -411,7 +411,7 @@ void SigmoidSiluMulti::peft_bwd_task(Task const *task, SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_active_peft_tokens() <= 0) { + if (bc->num_active_peft_tokens() == 0) { return; } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index d0e38c8017..ae75849f85 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -431,7 +431,7 @@ void Softmax::peft_bwd_task(Task const *task, assert(regions.size() == 2); assert(task->regions.size() == 2); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } Domain in_domain = runtime->get_index_space_domain( diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 62e152b36c..7f147dad6f 100644 --- a/src/parallel_ops/allreduce.cc +++ 
b/src/parallel_ops/allreduce.cc @@ -387,7 +387,9 @@ void AllReduce::peft_bwd_task(Task const *task, AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - + if (bc->num_active_peft_tokens() == 0) { + return; + } GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 22ab420674..20c0307a58 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -79,7 +79,7 @@ int BatchConfig::num_active_infr_tokens() const { } int BatchConfig::num_active_peft_tokens() const { - return 0; + return num_peft_tokens; } /*static*/ diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index df8d43bc38..e8adfcbded 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -607,17 +607,19 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, for (size_t i = 0; i < request.dataset[0].first.size(); i++) { new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].first[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; new_bc.num_tokens++; + new_bc.num_peft_tokens++; } for (size_t i = 0; i < request.dataset[0].second.size(); i++) { new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].second[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; int depth = request.dataset[0].first.size() + i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; new_bc.num_tokens++; + new_bc.num_peft_tokens++; } } } @@ -2119,7 +2121,7 @@ GenerationResult RequestManager::generate_incr_decoding( BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); - // im->peft_bwd(llm, 0, bcf); + im->peft_bwd(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); From 97696041181d32679e1d1d0a8d7cf3cc2e1b8a97 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 03:43:02 +0000 Subject: [PATCH 072/198] fix bwd bugs --- src/ops/inc_multihead_self_attention.cu | 7 ++++++- src/ops/kernels/linear_kernels.cu | 12 +++++++++--- src/runtime/cuda_helper.cu | 22 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index c406435327..1a93251db4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -469,8 +469,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; + } #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index dad6dc4e00..6f4016f2c2 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -457,14 +457,20 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr and output_grad_ptr offset - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = static_cast<DT *>
(input_grad_ptr) + num_infr_only_tokens * in_dim; output_grad_ptr = - static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; + static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index c2b2affc40..e2078fa663 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -608,6 +608,28 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { return CUDNN_DATA_FLOAT; } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + cudaPointerAttributes attributes; + cudaError_t cudaStatus = cudaPointerGetAttributes(&attributes, maybe_devicePtr); + + if (cudaStatus == cudaSuccess) { + // Check attributes and perform actions accordingly + if (attributes.type == cudaMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.type == cudaMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.type == cudaMemoryTypeUnregistered) { + printf("Pointer is unregistered.\n"); + } else if (attributes.type == cudaMemoryTypeManaged) { + printf("Pointer is managed.\n"); + } else { + printf("Pointer is not allocated in recognized memory type.\n"); + } + } else { + fprintf(stderr, "cudaPointerGetAttributes failed: %s\n", cudaGetErrorString(cudaStatus)); + } +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void From 880ede8541a26c976dcb57e9e7655fb93ed8d67f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 03:43:28 +0000 Subject: [PATCH 073/198] linting --- src/ops/kernels/linear_kernels.cu | 3 ++- src/runtime/cuda_helper.cu | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 6f4016f2c2..e56c4124d6 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -458,7 +458,8 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr and output_grad_ptr offset int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + input_grad_ptr = + static_cast<DT *>
(input_grad_ptr) + num_infr_only_tokens * in_dim; output_grad_ptr = static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index e2078fa663..0de6d9bc63 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -610,7 +610,8 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { void check_device_vs_host_ptr(void const *maybe_devicePtr) { cudaPointerAttributes attributes; - cudaError_t cudaStatus = cudaPointerGetAttributes(&attributes, maybe_devicePtr); + cudaError_t cudaStatus = + cudaPointerGetAttributes(&attributes, maybe_devicePtr); if (cudaStatus == cudaSuccess) { // Check attributes and perform actions accordingly @@ -626,9 +627,11 @@ void check_device_vs_host_ptr(void const *maybe_devicePtr) { printf("Pointer is not allocated in recognized memory type.\n"); } } else { - fprintf(stderr, "cudaPointerGetAttributes failed: %s\n", cudaGetErrorString(cudaStatus)); + fprintf(stderr, + "cudaPointerGetAttributes failed: %s\n", + cudaGetErrorString(cudaStatus)); } -} +} template __global__ void assign_kernel(half *ptr, coord_t size, half value); From 818375de38e6fa6d0f0495901ba04a6a592102c5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 03:53:50 +0000 Subject: [PATCH 074/198] fixes --- inference/incr_decoding/incr_decoding.cc | 8 ++++---- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/lora_linear.cc | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7ec574edf1..ed2b4705df 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -280,16 +280,16 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); std::vector prompts; - // std::vector> dataset; + std::vector> dataset; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - // dataset.push_back(std::make_pair(text, text)); + dataset.push_back(std::make_pair(text, text)); } - // rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, - // peft_model_id); + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); // for (auto &prompt : prompts) { // GenerationResult result = model.generate(prompt, 128 // /*max_sequence_length*/); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 1a93251db4..92a1f37097 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1294,7 +1294,7 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( bias_ptr, stream); } else if (input_grad.data_type == DT_FLOAT) { - assert(m->offload); + assert(!m->offload); float const *bias_ptr = use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 5870243ade..eb14517fab 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -612,7 +612,7 @@ void LoraLinear::peft_bwd_task(Task const *task, if (bc->num_active_peft_tokens() == 0) { return; } - assert(regions.size() == 6); + assert(regions.size() == 2); assert(task->regions.size() == regions.size()); assert(m->input_type[0] == m->output_type[0]); From 2990e2054781af0fa67c1114fb192cc798322e9e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 04:02:06 +0000 Subject: [PATCH 075/198] fix --- src/ops/rms_norm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index fe6944aa90..1e1de42b9a 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -577,9 +577,9 @@ void RMSNorm::peft_bwd_task(Task const *task, GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); } From 6959e6864b1f5ad184890dd715a7caf58d10d049 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 04:23:04 +0000 Subject: [PATCH 076/198] fix --- src/ops/rms_norm.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 1e1de42b9a..e6df27d49a 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -523,7 +523,10 @@ Legion::FutureMap ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -531,7 +534,7 @@ Legion::FutureMap Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, From 266368c69fbb7fcd8d02e06bce33cd22414f98f3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 04:51:54 +0000 Subject: [PATCH 077/198] fix --- include/flexflow/utils/cuda_helper.h | 1 + src/ops/kernels/rms_norm_kernels.cu | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 983c20525e..999bc27634 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -182,3 +182,4 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 8281506cbf..c9e0e02678 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -41,11 +41,15 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, DataType data_type = rms->weights[0]->data_type; size_t rms_ptr_size = batch_size; + size_t c2_ptr_size = rms_ptr_size; size_t norm_ptr_size = num_elements; - size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + size_t totalSize = + (rms_ptr_size + c2_ptr_size + norm_ptr_size) * data_type_size(data_type); gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); + c2_ptr = gpu_mem_allocator.allocate_instance_untyped( + c2_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); } @@ -473,6 +477,11 @@ void peft_bwd_kernel(RMSNormMeta const *m, const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; const int64_t N = m->num_elements; + check_device_vs_host_ptr(output_grad_ptr); + check_device_vs_host_ptr(m->input_activation); + check_device_vs_host_ptr(weight_ptr); + check_device_vs_host_ptr(m->rms_ptr); + check_device_vs_host_ptr(m->c2_ptr); ComputeInternalGradientsCUDAKernel <<>>( N, From 06775bdd7af17d111893ed5d3c59fe86c015862d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:01:47 +0000 Subject: [PATCH 078/198] add bc fields for peft training --- include/flexflow/request_manager.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a955eb0b9f..fa8c8ebeb7 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -57,6 +57,10 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified }; + enum RequestType { + REQ_INFERENCE = 201, + REQ_FINETUNING = 202 + }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id; int max_sequence_length; @@ -68,6 +72,9 @@ struct Request { std::vector tokens; std::vector beam_trees; // PEFT field + RequestType 
req_type = REQ_INFERENCE; + int completed_training_steps = 0; + int max_training_steps = 1; std::vector, std::vector>> dataset; From 9f601770949faa84407cbad10bdb03717bef93c0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:31:19 +0000 Subject: [PATCH 079/198] linting --- include/flexflow/request_manager.h | 5 +- src/runtime/request_manager.cc | 113 ++++++++++++++++------------- 2 files changed, 63 insertions(+), 55 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index fa8c8ebeb7..0aa654f9e7 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -57,10 +57,7 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified }; - enum RequestType { - REQ_INFERENCE = 201, - REQ_FINETUNING = 202 - }; + enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id; int max_sequence_length; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index f2b9c1ee52..faf99f37e5 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -425,7 +425,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.req_type == Request::REQ_FINETUNING) { // No new tokens generated when in fine-tuning mode continue; - } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { + } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < + request.tokens.size()) { // This is a prompt token continue; } else { @@ -449,31 +450,34 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; if (request.req_type == Request::REQ_FINETUNING) { - // fine-tuning requests don't automatically carry over to the next batch, - // we only do so if there is space left after adding new inference requests + // fine-tuning requests don't automatically carry over to the next + // batch, we only do so if there is space left after adding new + // inference requests request.completed_training_steps += 1; assert(request.completed_training_steps <= request.max_training_steps); if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = Request::COMPLETED; log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%zu)", - old_bc.requestsInfo[i].request_guid, - request.completed_training_steps); + old_bc.requestsInfo[i].request_guid, + request.completed_training_steps); GenerationResult &gr = request_generation_results[request.guid]; assert(gr.guid == request.guid); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.completed_training_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " + 
"finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.completed_training_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); } } else { int processed_tokens = @@ -482,7 +486,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + if (request.tokens.size() >= + old_bc.requestsInfo[i].max_sequence_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -511,47 +516,51 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "num decoding steps: " + << profile_info.decoding_steps << std::endl; + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); } - outputFile << std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); } - } } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = + processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = + new_bc.num_tokens; 
new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = @@ -565,17 +574,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + get_max_tokens_per_batch() - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; new_bc.num_tokens++; } // Update profiling @@ -625,7 +635,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Step 4: add PEFT bwd requests, if there is additional space - while(pending_peft_request_queue.size() > 0) { + while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); if (request.status == Request::COMPLETED) { @@ -638,7 +648,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request &request = pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); assert(request.dataset.size() > 0); - assert(request.max_training_steps > 0 && request.completed_training_steps < max_training_steps); + assert(request.max_training_steps > 0 && + request.completed_training_steps < max_training_steps); int num_peft_tokens = request.dataset[0].first.size() + request.dataset[0].second.size(); if (num_peft_tokens + new_bc.num_active_tokens() <= From 9442b62c40831fd008a489a946032284d4e2e281 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:36:33 +0000 Subject: [PATCH 080/198] fix --- src/runtime/request_manager.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index faf99f37e5..a224f400c6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -458,7 +458,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%zu)", + log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", old_bc.requestsInfo[i].request_guid, request.completed_training_steps); GenerationResult &gr = request_generation_results[request.guid]; @@ -474,7 +474,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, "[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " "finish(%.1lf) latency(%.1lf)", request.guid, - profile_info.completed_training_steps, + request.completed_training_steps, profile_info.start_time, profile_info.finish_time, profile_info.finish_time - profile_info.start_time); @@ -599,10 +599,10 @@ BatchConfig 
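For reference, the fine-tuning bookkeeping being reformatted in the hunk above reduces to a per-batch step counter with a completion check; a condensed restatement using the Request fields introduced earlier in this series (completed_training_steps, max_training_steps, status), sketching existing behavior rather than adding any:

  // one training step of this fine-tuning request finished in the previous batch
  request.completed_training_steps += 1;
  assert(request.completed_training_steps <= request.max_training_steps);
  if (request.completed_training_steps == request.max_training_steps) {
    // all steps done: mark the request complete and record profiling info
    request.status = Request::COMPLETED;
  }

Inference requests, by contrast, stay in the batch until they reach max_sequence_length or emit the EOS token, as handled in the else branch above.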
RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Step 3: add new requests to the next batch if there is space for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; @@ -649,7 +649,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(request.req_type = Request::REQ_FINETUNING); assert(request.dataset.size() > 0); assert(request.max_training_steps > 0 && - request.completed_training_steps < max_training_steps); + request.completed_training_steps < request.max_training_steps); int num_peft_tokens = request.dataset[0].first.size() + request.dataset[0].second.size(); if (num_peft_tokens + new_bc.num_active_tokens() <= From 11eccb1d269792b390505411bf2d7e83ddb4dd9b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:38:04 +0000 Subject: [PATCH 081/198] remove ptr check --- src/ops/kernels/rms_norm_kernels.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index c9e0e02678..ae6a5d590d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -477,11 +477,6 @@ void peft_bwd_kernel(RMSNormMeta const *m, const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; const int64_t N = m->num_elements; - check_device_vs_host_ptr(output_grad_ptr); - check_device_vs_host_ptr(m->input_activation); - check_device_vs_host_ptr(weight_ptr); - check_device_vs_host_ptr(m->rms_ptr); - check_device_vs_host_ptr(m->c2_ptr); ComputeInternalGradientsCUDAKernel <<>>( N, From 9bfc557eafd7c7366b258cd76981de8a19734e7c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 16:01:44 +0000 Subject: [PATCH 082/198] fix --- src/runtime/request_manager.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a224f400c6..b62172eac3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -416,7 +416,6 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { size_t guid = @@ -638,7 +637,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); - if (request.status == Request::COMPLETED) { + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = Request::REQ_FINETUNING); + if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { break; @@ -648,6 +649,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request &request = 
pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); assert(request.dataset.size() > 0); + // update status and training steps + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = Request::REQ_FINETUNING); + request.completed_training_steps = all_req_handle.completed_training_steps; + request.status = all_req_handle.status; + assert(request.status != Request::COMPLETED); assert(request.max_training_steps > 0 && request.completed_training_steps < request.max_training_steps); int num_peft_tokens = From bcfae08f4f15a4f53473fd5ee8cdc58d3379e8fe Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 12 Nov 2023 21:12:16 +0000 Subject: [PATCH 083/198] implement save_operators for bwd --- include/flexflow/op_meta.h | 1 + include/flexflow/operator.h | 15 ++++++++++----- src/runtime/model.cc | 2 ++ src/runtime/request_manager.cc | 3 +++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index dcf070c975..d31c12b16c 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -17,6 +17,7 @@ class OpMeta { bool profiling; // Measure the run time of the task bool inference_debugging; int decoding_step; + int bwd_step; char op_name[MAX_OPNAME]; LayerID layer_guid; bool trainable_inputs[MAX_NUM_INPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 388f9dcd6a..9d54996bf0 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -249,7 +249,8 @@ class Op { BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors) { + std::vector output_tensors, + bool fwd_pass=true) { // Check if output directory exists, and create it if it does not char const *folder_path = "./inference_tensors"; struct stat st = {0}; @@ -270,7 +271,7 @@ class Op { op_name_without_uid.erase(last_underscore); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + "_decoding-step_" + (fwd_pass ? 
std::to_string(m->decoding_step) : std::to_string(m->bwd_step)) + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); // save batch config, if passed @@ -300,8 +301,8 @@ class Op { assert(false && "Tensor data type not supported"); } } - // only dump the weights once - if (m->decoding_step == 0) { + // only dump the weights once (in fwd passes) + if (fwd_pass && m->decoding_step == 0) { for (int i = 0; i < weight_tensors.size(); i++) { std::string filename = base_filepath + "_weight_" + std::to_string(i); if (weight_tensors[i].data_type == DT_FLOAT) { @@ -349,7 +350,11 @@ class Op { } } // increase count of decoding steps - m->decoding_step++; + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4ccfe25a97..2ee4d4bc08 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1482,6 +1482,7 @@ OpMeta::OpMeta(FFHandler _handle) output_type[i] = DT_NONE; } decoding_step = 0; + bwd_step = 0; } #endif @@ -1502,6 +1503,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) output_type[i] = op->outputs[i]->data_type; } decoding_step = 0; + bwd_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b62172eac3..9e38235bbb 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -601,6 +601,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == Request::REQ_INFERENCE); pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -611,6 +612,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; // add profile_info for the new request ProfileInfo profile_info; From d86272c69c850c50a2b9bbf0fa0038ebf3720c83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 13 Nov 2023 00:11:41 +0000 Subject: [PATCH 084/198] fix bug --- src/runtime/batch_config.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 20c0307a58..e37ab9aed3 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,7 +25,7 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +BatchConfig::BatchConfig() : num_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].first_token_offset_in_batch = 0; From 0a3258a932b9c069198182698f2b1fd420589bf9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 13 Nov 2023 01:17:58 +0000 Subject: [PATCH 085/198] implement save tensors for bwd --- include/flexflow/batch_config.h | 2 +- include/flexflow/operator.h | 13 ++-- inference/incr_decoding/incr_decoding.cc | 60 ++++++++++----- 
src/ops/add_bias_residual_layer_norm.cc | 20 ++++- src/ops/element_unary.cc | 2 +- src/ops/fused.cu | 2 +- src/ops/group_by.cc | 2 +- src/ops/inc_multihead_self_attention.cc | 7 ++ src/ops/linear.cc | 8 +- src/ops/lora_linear.cc | 93 ++++++++++++++++++++++++ src/ops/residual_layer_norm.cc | 25 ++++++- src/ops/residual_rms_norm.cc | 14 +++- src/ops/rms_norm.cc | 8 +- src/ops/sigmoid_silu_multi.cc | 12 +++ src/ops/softmax.cc | 6 ++ 15 files changed, 241 insertions(+), 33 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a592674b6e..8ddcec7d53 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { static int const MAX_NUM_TOKENS = 1024; // Set by update - int num_tokens, num_peft_tokens; + int num_tokens = 0, num_peft_tokens = 0; struct PerRequestInfo { PerRequestInfo() { diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 9d54996bf0..af39412232 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -249,8 +249,8 @@ class Op { BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors, - bool fwd_pass=true) { + std::vector output_tensors, + bool fwd_pass = true) { // Check if output directory exists, and create it if it does not char const *folder_path = "./inference_tensors"; struct stat st = {0}; @@ -271,9 +271,12 @@ class Op { op_name_without_uid.erase(last_underscore); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + (fwd_pass ? std::to_string(m->decoding_step) : std::to_string(m->bwd_step)) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - op_name_without_uid + "_shard-id_" + std::to_string(shard_id); + (fwd_pass ? "_decoding-step_" : "_bwd-step_") + + (fwd_pass ? 
std::to_string(m->decoding_step) + : std::to_string(m->bwd_step)) + + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + + "_layer-name_" + op_name_without_uid + "_shard-id_" + + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { bc->save_to_file(base_filepath + "_batch-config"); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index ed2b4705df..045f5de3c8 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -32,6 +32,7 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string peft_dataset_path; std::string output_file_path; }; @@ -74,6 +75,11 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // PEFT dataset + if (!strcmp(argv[i], "-peft-dataset")) { + paths.peft_dataset_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -271,29 +277,47 @@ void FlexFlow::top_level_task(Task const *task, mlp_second /*mlp_second*/); int total_num_requests = 0; + int total_dataset_entries = 0; { using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); + std::vector prompts; std::vector> dataset; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); + + // Load prompts for inference + if (!file_paths.prompt_file_path.empty()) { + std::ifstream prompt_file_handle(file_paths.prompt_file_path); + assert(prompt_file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(prompt_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + prompts.push_back(text); + } + } + // Load HF dataset for PEFT training + if (!file_paths.peft_dataset_path.empty()) { + std::ifstream prompt_file_handle(file_paths.peft_dataset_path); + assert(prompt_file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(prompt_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Training dataset entry [%d]: %s\n", + total_dataset_entries, + text.c_str()); + total_dataset_entries++; + dataset.push_back(std::make_pair(text, text)); + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); + } } - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - // for (auto &prompt : prompts) { - // GenerationResult result = model.generate(prompt, 128 - // /*max_sequence_length*/); - // } GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 2ce2056050..82c71f517f 100644 
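For reference, the base_filepath assembled above resolves to one of two parallel naming schemes depending on fwd_pass; the bracketed placeholders below stand for the corresponding member values and are illustrative only:

  ./inference_tensors/model_[model_id]_decoding-step_[decoding_step]_layer-num_[transformer_layer_id]_layer-name_[op_name]_shard-id_[shard_id]
  ./inference_tensors/model_[model_id]_bwd-step_[bwd_step]_layer-num_[transformer_layer_id]_layer-name_[op_name]_shard-id_[shard_id]

Per-tensor suffixes (for example _batch-config and _weight_0) are appended to this base path, and the earlier operator.h hunk only dumps weight tensors when fwd_pass && m->decoding_step == 0.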
--- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -969,7 +969,7 @@ void AddBiasResidualLayerNorm::peft_bwd_task( return; } assert(task->regions.size() == regions.size()); - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == 4 + m->elementwise_affine); @@ -1017,6 +1017,24 @@ void AddBiasResidualLayerNorm::peft_bwd_task( } AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(attn_bias_grad); + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad, residual_grad}, + weights_accessors, + {output_grad}, + false /*fwd_pass*/); + } } bool AddBiasResidualLayerNorm::measure_operator_cost( diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 844aeb6de3..c643da5625 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -557,7 +557,7 @@ void ElementUnary::forward_task_with_type( assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - std::vector output_accessors; + std::vector output_accessors; if (m->inplace) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 1cb17ec20e..9954a8b43a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -648,7 +648,7 @@ __host__ void if (metas->meta[op]->inference_debugging) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; + std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 75960e7dcd..779d0d8f5d 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -396,7 +396,7 @@ void Group_by::forward_task(Task const *task, // Create a vector of n outputs, where n is the number of experts. 
// Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert - std::vector output_accessors; + std::vector output_accessors; float *outputs[n]; for (int i = 0; i < n; i++) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index d2c1209ade..b66d524303 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -998,6 +998,13 @@ void IncMultiHeadSelfAttention::peft_bwd_task( weight, output_grad, biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } } void IncMultiHeadSelfAttention::backward(FFModel const &ff) { diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 86f958a433..0887b6d35b 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -734,7 +734,7 @@ void Linear::peft_bwd_task(Task const *task, Runtime *runtime) { Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - LinearMeta const *m = *((LinearMeta **)task->local_args); + LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; @@ -765,6 +765,12 @@ void Linear::peft_bwd_task(Task const *task, out_dim, num_infr_tokens, num_peft_tokens); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } } void Linear::forward_task(Task const *task, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index eb14517fab..05edeab833 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -626,6 +626,99 @@ void LoraLinear::peft_bwd_task(Task const *task, // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + // output base filepath, shared by all tensors from the same operator + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_bwd-step_" + std::to_string(m->bwd_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + + // save batch config, if passed + if (bc != nullptr) { + 
bc->save_to_file(base_filepath + "_batch-config"); + } + + std::string filename = base_filepath + "_input_" + std::to_string(0); + if (input_grad.data_type == DT_FLOAT) { + save_tensor(input_grad.get_float_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else if (input_grad.data_type == DT_HALF) { + save_tensor(input_grad.get_half_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + + // std::cout << "base_filepath: " << base_filepath << std::endl; + // std::cout << "m->decoding_step: " << m->decoding_step << std::endl; + if (m->bwd_step == 0) { + for (auto it = m->model_weights.begin(); it != m->model_weights.end(); + ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_weights[peft_model_id]; + std::string filenameA = base_filepath + "_weight_A"; + std::string filenameB = base_filepath + "_weight_B"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + } + + filename = base_filepath + "_output_" + std::to_string(0); + if (output_grad.data_type == DT_FLOAT) { + save_tensor(output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else if (output_grad.data_type == DT_HALF) { + save_tensor(output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + m->bwd_step++; + } } void LoraLinear::backward(FFModel const &ff) { diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index e3b599d10f..4bee47de6c 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -763,8 +763,7 @@ void ResidualLayerNorm::peft_bwd_task( return; } assert(task->regions.size() == regions.size()); - ResidualLayerNormMeta const *m = - *((ResidualLayerNormMeta **)task->local_args); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); assert(regions.size() == 4 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 
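The LoraLinear backward-debugging block above dumps gradients under ./inference_tensors using a path built from the model id, backward step, transformer layer number, a layer name truncated after "lora", and the shard id, with "_weight_A" and "_weight_B" suffixes for the rank*in_dim and rank*out_dim adapter gradients. A small Python sketch of that naming scheme, with illustrative values for every id and name:

import os

def debug_tensor_base_path(model_id, bwd_step, layer_num, layer_name, shard_id,
                           folder="./inference_tensors"):
    # Equivalent of the stat()/mkdir() guard in the C++ code
    os.makedirs(folder, exist_ok=True)
    return (f"{folder}/model_{model_id}_bwd-step_{bwd_step}"
            f"_layer-num_{layer_num}_layer-name_{layer_name}"
            f"_shard-id_{shard_id}")

base = debug_tensor_base_path(0, 0, 11, "layers_11_feed_forward_w2_lora", 0)
print(base + "_weight_A")   # where the LoRA A gradient would be written
print(base + "_weight_B")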
3 : 2) : 0)); @@ -814,6 +813,28 @@ void ResidualLayerNorm::peft_bwd_task( } ResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input_grad); + input_accessors.push_back(residual1_grad); + if (m->use_two_residuals) { + input_accessors.push_back(residual2_grad); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {output_grad}, + false); + } } Op *ResidualLayerNorm::materialize(FFModel &ff, diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 8013c0e81a..a57b9248c7 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -670,7 +670,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, Runtime *runtime) { assert(task->regions.size() == 4); assert(regions.size() == 4); - ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; @@ -695,6 +695,18 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {residual_input0_grad, residual_input1_grad}, + {weight}, + {output_grad}, + false); + } } Op *ResidualRMSNorm::materialize(FFModel &ff, diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index e6df27d49a..5a8cfe8eff 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -572,7 +572,7 @@ void RMSNorm::peft_bwd_task(Task const *task, Runtime *runtime) { assert(task->regions.size() == 3); assert(regions.size() == 3); - RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; @@ -584,6 +584,12 @@ void RMSNorm::peft_bwd_task(Task const *task, GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } } void RMSNorm::serialize(Legion::Serializer &sez) const { diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 14c202f784..d064bd0a1c 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -424,6 +424,18 @@ void SigmoidSiluMulti::peft_bwd_task(Task const *task, SigmoidSiluMulti::peft_bwd_kernel_wrapper( m, bc, output_grad, input1_grad, input2_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = 
task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file(m, + shard_id, + nullptr, + {input1_grad, input2_grad}, + {}, + {output_grad}, + false); + } } FutureMap SigmoidSiluMulti::inference( diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ae75849f85..88ffec3642 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -442,6 +442,12 @@ void Softmax::peft_bwd_task(Task const *task, GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } } bool Softmax::get_int_parameter(PMParameter para, int *value) const { From e34c40541e59dc8ff342aa9a228cf6ddb8938c22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 00:44:50 +0000 Subject: [PATCH 086/198] . --- inference/incr_decoding/incr_decoding.cc | 60 +++------ tests/peft/hf_finetune.py | 152 +++++++++++++++++++---- 2 files changed, 149 insertions(+), 63 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 045f5de3c8..c76637a62c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -32,7 +32,6 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; - std::string peft_dataset_path; std::string output_file_path; }; @@ -75,11 +74,6 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } - // PEFT dataset - if (!strcmp(argv[i], "-peft-dataset")) { - paths.peft_dataset_path = std::string(argv[++i]); - continue; - } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -277,47 +271,29 @@ void FlexFlow::top_level_task(Task const *task, mlp_second /*mlp_second*/); int total_num_requests = 0; - int total_dataset_entries = 0; { using json = nlohmann::json; - + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); std::vector prompts; std::vector> dataset; - - // Load prompts for inference - if (!file_paths.prompt_file_path.empty()) { - std::ifstream prompt_file_handle(file_paths.prompt_file_path); - assert(prompt_file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(prompt_file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - } - } - // Load HF dataset for PEFT training - if (!file_paths.peft_dataset_path.empty()) { - std::ifstream prompt_file_handle(file_paths.peft_dataset_path); - assert(prompt_file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(prompt_file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - 
printf("Training dataset entry [%d]: %s\n", - total_dataset_entries, - text.c_str()); - total_dataset_entries++; - dataset.push_back(std::make_pair(text, text)); - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - } + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + //prompts.push_back(text); + dataset.push_back(std::make_pair(text, text)); } + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); + // for (auto &prompt : prompts) { + // GenerationResult result = model.generate(prompt, 128 + // /*max_sequence_length*/); + // } GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index cf157a8913..3fe01db283 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -1,11 +1,17 @@ -import os, sys -#os.environ["CUDA_VISIBLE_DEVICES"]="0" +import os, sys, shutil import torch +# Reproducibility +import random +import numpy as np +torch.manual_seed(0) +random.seed(0) +np.random.seed(0) +#torch.use_deterministic_algorithms(True) import torch.nn as nn #import bitsandbytes as bnb from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer import argparse -from peft import LoraConfig, get_peft_model +from peft import LoraConfig, get_peft_model, PeftModel import transformers from datasets import load_dataset @@ -27,6 +33,75 @@ def print_trainable_parameters(model): f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" ) +def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + .replace("base_", "") + .replace("default_", "") + ) + +def peft_backward_hook(module, grad_input, grad_output): + if len(grad_input) == 0 or len(grad_output) == 0: + return + assert(module.name is not None and module.bwd_step is not None) + name = module.name.replace("base_model.model.model.", "") + print(f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") + print("Backward GRAD Input:") + for i,gi in enumerate(grad_input): + if type(gi) == torch.Tensor: + print(gi.shape) + torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") + else: + print(gi) + print("Backward GRAD Output:") + for i, go in enumerate(grad_output): + if type(go) == torch.Tensor: + print(go.shape) + torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") + else: + print(go) + + print("===") + module.bwd_step += 1 + +def peft_forward_hook(module, input, output): + if len(input) == 0 or len(output) == 0: + return + assert(module.name is not None and module.fwd_step is not None) + name = module.name.replace("base_model.model.model.", "") + print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") + print("Input:") + for i,inp in enumerate(input): + if type(inp) == torch.Tensor: + 
print(inp.shape) + torch.save(inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}") + else: + print(inp) + print("Output:") + for i, out in enumerate(output): + if type(out) == torch.Tensor: + print(out.shape) + torch.save(out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}") + else: + print(out) + #print("Forward Input/Output: ", input[0].shape, output[0].shape) + print("===") + module.fwd_step += 1 + def main(): parser = argparse.ArgumentParser() parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") @@ -37,6 +112,7 @@ def main(): parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") parser.add_argument("--output-dir", type=str, default="") parser.add_argument("--publish-peft-with-id", type=str, default="") + parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") args = parser.parse_args() model_name = args.model_name use_full_precision=args.use_full_precision @@ -46,8 +122,9 @@ def main(): lora_dropout = args.lora_dropout output_dir = args.output_dir publish_peft_with_id = args.publish_peft_with_id - if len(output_dir) == 0 and len(publish_peft_with_id) == 0: - raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") + save_peft_tensors = args.save_peft_tensors + # if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + # raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") # Change working dir to folder storing this script abspath = os.path.abspath(__file__) @@ -71,16 +148,18 @@ def main(): if tokenizer.pad_token is None: tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" - + + peft_model_name = "goliaro/llama-2-7b-lora-full" + model = PeftModel.from_pretrained(model, peft_model_name) + for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later if param.ndim == 1: # cast the small parameters (e.g. 
layernorm) to fp32 for stability param.data = param.data.to(torch.float32) - model.gradient_checkpointing_enable() # reduce number of stored activations + #model.gradient_checkpointing_enable() # reduce number of stored activations model.enable_input_require_grads() - model.lm_head = CastOutputToFloat(model.lm_head) config = LoraConfig( @@ -89,26 +168,51 @@ def main(): #target_modules=["q_proj", "v_proj"], #target_modules=["down_proj"], target_modules=lora_target_modules, - lora_dropout=lora_dropout, + lora_dropout=0.0, bias="none", task_type="CAUSAL_LM" ) + model = get_peft_model(model, config) + print(model) print(model.named_parameters()) - model = get_peft_model(model, config) + #model = get_peft_model(model, config) print_trainable_parameters(model) - data = load_dataset("Abirate/english_quotes") + if save_peft_tensors: + shutil.rmtree("./hf_peft_tensors", ignore_errors=True) + # Check that the output folder exists + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save hidden states and gradients + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + layer.bwd_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(peft_forward_hook) + layer.register_backward_hook(peft_backward_hook) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + torch.save(params, f"./hf_peft_tensors/{name}") + # Overwrite FF cached weight + dst_folder = f"/home/ubuntu/.cache/flexflow/weights/{peft_model_name}/full-precision" + assert(os.path.exists(dst_folder)) + ff_w_name = convert_hf_weight_name(name) + print(f"{dst_folder}/{ff_w_name}") + params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") + + data = load_dataset("/home/ubuntu/english_quotes") data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) trainer = transformers.Trainer( model=model, train_dataset=data['train'], args=transformers.TrainingArguments( - per_device_train_batch_size=4, - gradient_accumulation_steps=4, - warmup_steps=100, - max_steps=200, + per_device_train_batch_size=1, + gradient_accumulation_steps=1, + warmup_steps=0, + max_steps=1, learning_rate=2e-4, fp16=True if not use_full_precision else False, logging_steps=1, @@ -117,15 +221,21 @@ def main(): data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) model.config.use_cache = False # silence the warnings. Please re-enable for inference! + + for batch in trainer.get_train_dataloader(): + print("First batch: ") + print(batch) + break + trainer.train() - if len(output_dir) > 0: - print(f"Done fine-tuning! Saving the model to {output_dir}...") - model.save_pretrained(output_dir) + # if len(output_dir) > 0: + # print(f"Done fine-tuning! Saving the model to {output_dir}...") + # model.save_pretrained(output_dir) - if len(publish_peft_with_id) > 0: - print(f"Done fine-tuning! Uploading the model to HF hub with id: {publish_peft_with_id}...") - model.push_to_hub(publish_peft_with_id, use_auth_token=True) + # if len(publish_peft_with_id) > 0: + # print(f"Done fine-tuning! 
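The weight-overwrite loop above relies on convert_hf_weight_name to turn Hugging Face parameter names into FlexFlow's cached-weight filenames before overwriting the cached FlexFlow weight files. A quick check of what that replacement chain produces for a representative adapter parameter (the layer index and module choice are arbitrary examples):

def convert_hf_weight_name(name):
    # Same replacement chain as the helper defined in hf_finetune.py above
    return (
        name.replace(".", "_")
        .replace("self_attn", "attention")
        .replace("q_proj", "wq")
        .replace("k_proj", "wk")
        .replace("v_proj", "wv")
        .replace("o_proj", "wo")
        .replace("mlp", "feed_forward")
        .replace("gate_proj", "w1")
        .replace("down_proj", "w2")
        .replace("up_proj", "w3")
        .replace("input_layernorm", "attention_norm")
        .replace("post_attention_layernorm", "ffn_norm")
        .replace("embed_tokens", "tok_embeddings")
        .replace("lm_head", "output")
        .replace("model_", "")
        .replace("base_", "")
        .replace("default_", "")
    )

name = "base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight"
print(convert_hf_weight_name(name))   # -> layers_11_feed_forward_w2_lora_A_weight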
Uploading the model to HF hub with id: {publish_peft_with_id}...") + # model.push_to_hub(publish_peft_with_id, use_auth_token=True) if __name__ == "__main__": main() \ No newline at end of file From 87fbadae3e69e29607a4e3f768514ae961dc013b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 02:48:54 +0000 Subject: [PATCH 087/198] bug fix --- tests/peft/hf_finetune.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 3fe01db283..067178808c 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -64,14 +64,14 @@ def peft_backward_hook(module, grad_input, grad_output): for i,gi in enumerate(grad_input): if type(gi) == torch.Tensor: print(gi.shape) - torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") + torch.save(gi, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") else: print(gi) print("Backward GRAD Output:") for i, go in enumerate(grad_output): if type(go) == torch.Tensor: print(go.shape) - torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") + torch.save(go, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") else: print(go) @@ -201,6 +201,8 @@ def main(): ff_w_name = convert_hf_weight_name(name) print(f"{dst_folder}/{ff_w_name}") params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") + if "lm_head" in name: + torch.save(params, f"./hf_peft_tensors/{name}") data = load_dataset("/home/ubuntu/english_quotes") data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) From 52759bdc1e127ef842fb2fbb7f78bc75bf5d8789 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 03:54:49 +0000 Subject: [PATCH 088/198] fix --- tests/peft/hf_finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 067178808c..6dcb692f76 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -190,7 +190,7 @@ def main(): layer.bwd_step = 0 print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) - layer.register_backward_hook(peft_backward_hook) + layer.register_full_backward_hook(peft_backward_hook) # Save weights for name, params in model.named_parameters(): if "lora" in name: From 2a5371da46ac9034ed6d4fe2dc360295f56f1567 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 23:06:32 +0000 Subject: [PATCH 089/198] align linear --- src/ops/kernels/linear_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index e56c4124d6..0a2b5df06d 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -452,7 +452,6 @@ void peft_bwd_kernel(LinearMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); @@ -493,6 +492,7 @@ void peft_bwd_kernel(LinearMeta const *m, // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients + DT alpha = 1.0f, beta = 0.0f; if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -507,7 +507,7 @@ void peft_bwd_kernel(LinearMeta const *m, 
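The two fixes above tighten the hook instrumentation from the previous patch: the backward hook now saves grad_input and grad_output under their own .gi_* and .go_* names instead of writing grad_output twice, and hooks are registered with register_full_backward_hook rather than the deprecated register_backward_hook (a later patch also saves grad_output before grad_input). A condensed sketch of the corrected pattern on a toy module (module name and file names are illustrative; tensors are saved into the current directory):

import torch
import torch.nn as nn

def fwd_hook(module, inputs, outputs):
    outs = outputs if isinstance(outputs, tuple) else (outputs,)
    for i, t in enumerate(outs):
        if torch.is_tensor(t):
            torch.save(t, f"fwd_step_{module.fwd_step}_{module.name}.output_{i}")
    module.fwd_step += 1

def bwd_hook(module, grad_input, grad_output):
    for i, g in enumerate(grad_output):   # upstream gradients
        if torch.is_tensor(g):
            torch.save(g, f"bwd_step_{module.bwd_step}_{module.name}.go_{i}")
    for i, g in enumerate(grad_input):    # gradients w.r.t. the module inputs
        if torch.is_tensor(g):
            torch.save(g, f"bwd_step_{module.bwd_step}_{module.name}.gi_{i}")
    module.bwd_step += 1

layer = nn.Linear(4, 4)
layer.name, layer.fwd_step, layer.bwd_step = "toy_linear", 0, 0
layer.register_forward_hook(fwd_hook)
layer.register_full_backward_hook(bwd_hook)
layer(torch.randn(2, 4)).sum().backward()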
output_grad_ptr, output_type, out_dim, - &alpha, + &beta, input_grad_ptr, input_type, in_dim, From ed0be61ad14f3ae4292c888ca7b088660880d10d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 22:27:22 -0500 Subject: [PATCH 090/198] fix --- python/flexflow/serve/serve.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 24cf9efb30..3349809670 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -550,7 +550,12 @@ def download_hf_config(self): print(f"Creating directory {self.config_dir} (if it doesn't exist)...") print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") with open(self.config_path, "w") as json_file: - json.dump(self.hf_config.to_dict(), json_file, indent=2) + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) def __get_revision_hashes(self, peft_model_id: str): ff_revision = None From 8a0b6ea7d7cb1ac38a2ed9cfdf610638a3352fa6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 Nov 2023 07:20:27 +0000 Subject: [PATCH 091/198] bwd kernel updates --- src/ops/kernels/linear_kernels.cu | 21 ++-- src/ops/kernels/residual_rms_norm_kernels.cu | 100 ++++++++++--------- 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 0a2b5df06d..21629ec024 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -461,17 +461,18 @@ void peft_bwd_kernel(LinearMeta const *m, static_cast
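The "align linear" change above hands cuBLAS a separate beta = 0.0 for the input-gradient GEMM instead of reusing alpha. Since cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C, beta = 0 overwrites input_grad rather than accumulating into whatever it already holds (the surviving NOTE comment about accumulating with alpha = 1 no longer matches the code). A small NumPy illustration of the two behaviours:

import numpy as np

def gemm(alpha, A, B, beta, C):
    # The cuBLAS GEMM contract: C <- alpha * A @ B + beta * C
    return alpha * (A @ B) + beta * C

rng = np.random.default_rng(0)
W = rng.standard_normal((3, 2))
dY = rng.standard_normal((2, 4))
dX_prev = rng.standard_normal((3, 4))        # stale contents of input_grad

overwrite = gemm(1.0, W, dY, 0.0, dX_prev)   # beta = 0: previous values ignored
accumulate = gemm(1.0, W, dY, 1.0, dX_prev)  # beta = 1: gradients accumulate
print(np.allclose(accumulate, overwrite + dX_prev))   # True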
(input_grad_ptr) + num_infr_only_tokens * in_dim; output_grad_ptr = static_cast
(output_grad_ptr) + num_infr_only_tokens * out_dim; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { relu_backward_kernel(m->output_type[0], diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index de84e50e29..42a8747cbf 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -128,18 +128,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -147,11 +142,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -164,26 +160,17 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -345,16 
+332,22 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { - __shared__ T ds_storage[C10_WARP_SIZE]; + __shared__ float ds_storage[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T ds = 0; + float ds = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int const index = i * N + j; - ds += dY[index] * X[index] * gamma[j]; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); } - ds = BlockReduceSum(ds, ds_storage); + ds = BlockReduceSum(ds, ds_storage); if (threadIdx.x == 0) { - c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); } } @@ -370,9 +363,14 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - T dX_val = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; - dX1[index] += dX_val; - dX2[index] += dX_val; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + // dX1[index] += dX_val; + // dX2[index] += dX_val; + dX1[index] = static_cast(dX_val); + dX2[index] = static_cast(dX_val); } } @@ -452,12 +450,15 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, continue; } - const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; - const int64_t N = m->num_elements; + int M = m->batch_size; // TODO: replace with + // m->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + T const *residual_output_rms_input_ptr = static_cast(m->input_activation); + ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, residual_output_rms_input_ptr, @@ -466,14 +467,15 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, static_cast(m->norm_ptr)); RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr); + <<>>( + m->in_dim, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr); } } From b0e686d3273014cb2fec9f2eeea104a1c4c649fb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 Nov 2023 22:18:20 +0000 Subject: [PATCH 092/198] undo use of CUBLAS_COMPUTE_32F_FAST_16F for now --- src/ops/inc_multihead_self_attention.cpp | 26 ++++---- src/ops/inc_multihead_self_attention.cu | 63 ++++++++++--------- src/ops/kernels/linear_kernels.cpp | 39 ++++++------ src/ops/kernels/linear_kernels.cu | 47 +++++++------- src/ops/kernels/lora_linear_kernels.cu | 42 +++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 13 ++-- src/ops/spec_inc_multihead_self_attention.cu | 21 ++++--- src/ops/tree_inc_multihead_self_attention.cpp | 13 ++-- src/ops/tree_inc_multihead_self_attention.cu | 21 ++++--- tests/peft/hf_finetune.py | 43 +++++++++---- 10 files changed, 182 insertions(+), 146 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 4495f66844..188659bea0 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,13 +257,14 @@ void 
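The rewritten residual RMS-norm backward kernels above accumulate their reductions in float and express the input gradient through two per-row coefficients: c1 is the 1/RMS value saved by the forward pass, and c2 = -sum(dY * X * gamma) * rrms^3 / N; the same dX is then written to both residual input gradients. A quick double-precision PyTorch check that this closed form matches autograd for a single row of plain RMS norm (sizes and eps are arbitrary):

import torch

torch.manual_seed(0)
N, eps = 8, 1e-6
x = torch.randn(N, dtype=torch.double, requires_grad=True)
gamma = torch.randn(N, dtype=torch.double)
dY = torch.randn(N, dtype=torch.double)       # upstream gradient

rrms = torch.rsqrt(x.pow(2).mean() + eps)     # 1 / RMS(x)
y = x * rrms * gamma
y.backward(dY)

c1 = rrms.detach()
c2 = -(dY * x.detach() * gamma).sum() * c1**3 / N
dX = c1 * dY * gamma + c2 * x.detach()
print(torch.allclose(x.grad, dX))             # True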
compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -510,13 +511,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 92a1f37097..e597c7de97 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,17 +238,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -466,17 +467,18 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = 
ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -883,17 +885,18 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 2e7ae68314..4fa8ab244f 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -274,13 +274,14 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use the output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = output_type; +// #else +// // TODO: currently use the output_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = output_type; +// #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, 
@@ -370,13 +371,14 @@ void peft_bwd_kernel(LinearMeta const *m, hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = output_type; +// #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { relu_backward_kernel(m->output_type[0], @@ -440,13 +442,14 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = output_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = output_type; +// #else +// // TODO: currently use output_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = output_type; +// #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 21629ec024..248e59bdeb 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -365,17 +365,18 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -538,17 +539,19 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for 
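The "undo use of CUBLAS_COMPUTE_32F_FAST_16F" patch above replaces the version-conditional CUBLAS_COMPUTE_16F / CUBLAS_COMPUTE_32F_FAST_16F selection with the plain output data type across the attention, linear, and LoRA kernels, so fp32 runs no longer take the fp16 tensor-core fast path; a plausible reason, given the surrounding alignment work, is to keep results closely comparable with the Hugging Face reference. PyTorch exposes an analogous reduced-precision matmul switch (TF32), and a sketch like the following shows the kind of numerical drift such fast paths introduce (requires a CUDA device; on GPUs without TF32 support the difference is simply zero):

import torch

assert torch.cuda.is_available(), "this illustration needs a CUDA device"
a = torch.randn(512, 512, device="cuda")
b = torch.randn(512, 512, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = True    # reduced-precision fast path
fast = a @ b
torch.backends.cuda.matmul.allow_tf32 = False   # full fp32 accumulation
exact = a @ b
print((fast - exact).abs().max())               # often non-zero on Ampere or newer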
half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif + int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 2d271efe72..85a5d9990f 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -145,17 +145,18 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t lr_actv_type = output_type; assert(input_type == output_type); cudaDataType_t weight_type = output_type; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->input_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = output_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->input_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -267,17 +268,18 @@ void peft_bwd_kernel(LoraLinearMeta *m, assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = output_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 6252693d1a..d827a79c22 100644 --- 
a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,13 +200,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index fb96862b81..999492f7c3 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,17 +215,18 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 61117ce6df..d385880a74 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,13 +157,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && 
(CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 8c2ee24132..fc3d1fda72 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,17 +158,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 6dcb692f76..5650eff3e9 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -54,26 +54,41 @@ def convert_hf_weight_name(name): .replace("default_", "") ) +def pre_peft_backward_hook(module, grad_output): + assert (len(grad_output) == 1) + assert ("lm_head" in module.name) + name = module.name.replace("base_model.model.model.", "") + print(f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") + print(grad_output[0].shape) + dev = grad_output[0].device + new_grad_output = torch.full(grad_output[0].shape, 0.5).to(dev) + assert(new_grad_output.shape == grad_output[0].shape) + return (new_grad_output,) + def peft_backward_hook(module, grad_input, grad_output): if len(grad_input) == 0 or len(grad_output) == 0: return assert(module.name is not None and module.bwd_step is not None) name = module.name.replace("base_model.model.model.", "") print(f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") - print("Backward GRAD Input:") - for i,gi in enumerate(grad_input): - if type(gi) == torch.Tensor: - print(gi.shape) - torch.save(gi, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") - else: - print(gi) print("Backward GRAD Output:") for i, go in enumerate(grad_output): if type(go) == torch.Tensor: - print(go.shape) - torch.save(go, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") + dst_filepath = f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}" + print("\t", go.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(go, dst_filepath) else: 
print(go) + print("Backward GRAD Input:") + for i,gi in enumerate(grad_input): + if type(gi) == torch.Tensor: + dst_filepath = f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}" + print("\t", gi.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(gi, dst_filepath) + else: + print(gi) print("===") module.bwd_step += 1 @@ -106,7 +121,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") parser.add_argument("--lora-rank", type=int, default=16) - parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument("--lora-alpha", type=int, default=16) parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") parser.add_argument("--lora-dropout", type=float, default=0.05) parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") @@ -149,7 +164,8 @@ def main(): tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" - peft_model_name = "goliaro/llama-2-7b-lora-full" + #peft_model_name = "goliaro/llama-2-7b-lora-full" + peft_model_name = "goliaro/llama-160m-lora-full" model = PeftModel.from_pretrained(model, peft_model_name) for param in model.parameters(): @@ -191,6 +207,9 @@ def main(): print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) layer.register_full_backward_hook(peft_backward_hook) + # base_model.model.base_model.model.lm_head + if "lm_head" in name: + layer.register_full_backward_pre_hook(pre_peft_backward_hook) # Save weights for name, params in model.named_parameters(): if "lora" in name: @@ -201,7 +220,7 @@ def main(): ff_w_name = convert_hf_weight_name(name) print(f"{dst_folder}/{ff_w_name}") params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") - if "lm_head" in name: + if "lm_head" in name or "norm" in name: torch.save(params, f"./hf_peft_tensors/{name}") data = load_dataset("/home/ubuntu/english_quotes") From 0daf2329303402c66a0f2967879a34738aeb5b30 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 19 Nov 2023 20:15:04 +0000 Subject: [PATCH 093/198] only send dataset entry once --- inference/incr_decoding/incr_decoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c76637a62c..5375acb355 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -286,7 +286,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; //prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); + dataset.push_back(std::make_pair(text, "")); } rm->register_new_peft_request( dataset, 256 /*max_sequence_length*/, peft_model_id); From ec131c71d5bbe37eaeb464efd2c2b0a52ed7f7c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 13:15:45 -0500 Subject: [PATCH 094/198] update peft test scripts --- tests/peft/fine_tune.sh | 20 ++-- tests/peft/hf_finetune.py | 236 +++++++++++++++++++++----------------- tests/peft/hf_serve.py | 45 ++++++-- tests/peft/hf_train.py | 161 ++++++++++++++++++++++++++ 4 files changed, 334 insertions(+), 128 deletions(-) create mode 100644 tests/peft/hf_train.py diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh index eddb6139d0..309d87130a 100755 --- a/tests/peft/fine_tune.sh +++ b/tests/peft/fine_tune.sh @@ -5,15 
+5,15 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full -python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half -python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full -python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half +python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full +python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half +python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full +python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half -python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full -python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half +python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full +python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half -python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full -python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half -python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full -python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half +python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full +python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half +python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full +python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 5650eff3e9..7836633b30 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -1,24 +1,34 @@ import os, sys, shutil import torch + # Reproducibility import random import numpy as np + torch.manual_seed(0) random.seed(0) np.random.seed(0) -#torch.use_deterministic_algorithms(True) +# torch.use_deterministic_algorithms(True) import torch.nn as nn -#import bitsandbytes as bnb + +# import bitsandbytes as bnb from transformers 
import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer import argparse -from peft import LoraConfig, get_peft_model, PeftModel +from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig import transformers -from datasets import load_dataset + +if transformers.__version__ < "4.31.0": + raise RuntimeError( + "Please update the transformers library version to 4.31.0 or above" + ) +from datasets import load_dataset, DatasetDict + class CastOutputToFloat(nn.Sequential): - def forward(self, x): + def forward(self, x): return super().forward(x).to(torch.float32) + def print_trainable_parameters(model): """ Prints the number of trainable parameters in the model. @@ -33,42 +43,26 @@ def print_trainable_parameters(model): f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" ) -def convert_hf_weight_name(name): - return ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - .replace("base_", "") - .replace("default_", "") - ) -def pre_peft_backward_hook(module, grad_output): - assert (len(grad_output) == 1) - assert ("lm_head" in module.name) +def lm_head_pre_backward_hook(module, grad_output): + # Fill grad input tensor with 0.5 to align other layers without having to align loss + assert len(grad_output) == 1 + assert "lm_head" in module.name name = module.name.replace("base_model.model.model.", "") - print(f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") + print( + f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" + ) print(grad_output[0].shape) dev = grad_output[0].device new_grad_output = torch.full(grad_output[0].shape, 0.5).to(dev) - assert(new_grad_output.shape == grad_output[0].shape) + assert new_grad_output.shape == grad_output[0].shape return (new_grad_output,) + def peft_backward_hook(module, grad_input, grad_output): if len(grad_input) == 0 or len(grad_output) == 0: return - assert(module.name is not None and module.bwd_step is not None) + assert module.name is not None and module.bwd_step is not None name = module.name.replace("base_model.model.model.", "") print(f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") print("Backward GRAD Output:") @@ -81,7 +75,7 @@ def peft_backward_hook(module, grad_input, grad_output): else: print(go) print("Backward GRAD Input:") - for i,gi in enumerate(grad_input): + for i, gi in enumerate(grad_input): if type(gi) == torch.Tensor: dst_filepath = f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}" print("\t", gi.shape) @@ -89,110 +83,125 @@ def peft_backward_hook(module, grad_input, grad_output): torch.save(gi, dst_filepath) else: print(gi) - + print("===") module.bwd_step += 1 + def peft_forward_hook(module, input, output): if len(input) == 0 or len(output) == 0: return - assert(module.name is not None and module.fwd_step is not None) + assert module.name is not None and module.fwd_step is not None name = module.name.replace("base_model.model.model.", "") print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") 
print("Input:") - for i,inp in enumerate(input): + for i, inp in enumerate(input): if type(inp) == torch.Tensor: print(inp.shape) - torch.save(inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}") + torch.save( + inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}" + ) else: print(inp) print("Output:") for i, out in enumerate(output): if type(out) == torch.Tensor: print(out.shape) - torch.save(out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}") + torch.save( + out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}" + ) else: print(out) - #print("Forward Input/Output: ", input[0].shape, output[0].shape) + # print("Forward Input/Output: ", input[0].shape, output[0].shape) print("===") module.fwd_step += 1 + def main(): parser = argparse.ArgumentParser() - parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") - parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument( + "--peft-model-id", type=str, default="goliaro/llama-160m-lora-full" + ) parser.add_argument("--lora-alpha", type=int, default=16) - parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") - parser.add_argument("--lora-dropout", type=float, default=0.05) - parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--lora-dropout", type=float, default=0.0) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) parser.add_argument("--output-dir", type=str, default="") parser.add_argument("--publish-peft-with-id", type=str, default="") - parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) args = parser.parse_args() - model_name = args.model_name - use_full_precision=args.use_full_precision - lora_rank = args.lora_rank + peft_model_id = args.peft_model_id + use_full_precision = args.use_full_precision lora_alpha = args.lora_alpha - lora_target_modules = args.lora_target_modules.split(",") lora_dropout = args.lora_dropout output_dir = args.output_dir publish_peft_with_id = args.publish_peft_with_id save_peft_tensors = args.save_peft_tensors - # if len(output_dir) == 0 and len(publish_peft_with_id) == 0: - # raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) + # Get PEFT layer, edit any configs as needed + peft_config = PeftConfig.from_pretrained(peft_model_id) + if peft_config.peft_type != "LORA": + raise ValueError(f"PEFT type {peft_config.peft_type} not supported yet") + peft_config.lora_alpha = lora_alpha + peft_config.lora_dropout = lora_dropout + peft_config.init_lora_weights = ( + False + ) # prevent HF from re-inizialing the weights randomly + model_name = peft_config.base_model_name_or_path + # Load base model, and apply the PEFT layer model = AutoModelForCausalLM.from_pretrained( model_name, - #load_in_8bit=True, - torch_dtype = torch.float32 if use_full_precision else torch.float16, - device_map='auto', + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", ) + model = 
PeftModel.from_pretrained(model, peft_model_id, config=peft_config) # Get Tokenizer hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) hf_arch = getattr(hf_config, "architectures")[0] if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) else: - tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) if tokenizer.pad_token is None: tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" - - #peft_model_name = "goliaro/llama-2-7b-lora-full" - peft_model_name = "goliaro/llama-160m-lora-full" - model = PeftModel.from_pretrained(model, peft_model_name) - - for param in model.parameters(): - param.requires_grad = False # freeze the model - train adapters later + + # Freeze all layers except the LORA ones. Cast small layers to full precision for stability + for name, param in model.named_parameters(): + if "lora" not in name: + param.requires_grad = False # freeze the model - train adapters later + else: + param.requires_grad = True if param.ndim == 1: # cast the small parameters (e.g. layernorm) to fp32 for stability param.data = param.data.to(torch.float32) - - #model.gradient_checkpointing_enable() # reduce number of stored activations + if not save_peft_tensors: + model.gradient_checkpointing_enable() # reduce number of stored activations model.enable_input_require_grads() model.lm_head = CastOutputToFloat(model.lm_head) - config = LoraConfig( - r=lora_rank, - lora_alpha=lora_alpha, - #target_modules=["q_proj", "v_proj"], - #target_modules=["down_proj"], - target_modules=lora_target_modules, - lora_dropout=0.0, - bias="none", - task_type="CAUSAL_LM" - ) - model = get_peft_model(model, config) - + # Print model with PEFT print(model) - print(model.named_parameters()) - #model = get_peft_model(model, config) + for name, params in model.named_parameters(): + print(name) print_trainable_parameters(model) if save_peft_tensors: @@ -207,28 +216,34 @@ def main(): print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) layer.register_full_backward_hook(peft_backward_hook) - # base_model.model.base_model.model.lm_head + # TODO: remove hard-coding of lm head grad input after aligning the loss if "lm_head" in name: - layer.register_full_backward_pre_hook(pre_peft_backward_hook) - # Save weights + layer.register_full_backward_pre_hook(lm_head_pre_backward_hook) + # Save any weights of interest for name, params in model.named_parameters(): if "lora" in name: torch.save(params, f"./hf_peft_tensors/{name}") - # Overwrite FF cached weight - dst_folder = f"/home/ubuntu/.cache/flexflow/weights/{peft_model_name}/full-precision" - assert(os.path.exists(dst_folder)) - ff_w_name = convert_hf_weight_name(name) - print(f"{dst_folder}/{ff_w_name}") - params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") if "lm_head" in name or "norm" in name: torch.save(params, f"./hf_peft_tensors/{name}") - data = load_dataset("/home/ubuntu/english_quotes") - data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) + # Load fine-tuning 
dataset + data = load_dataset("Abirate/english_quotes") + + # TODO: remove using of a single row + key_to_filter = "quote" + desired_value = "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”" + filtered_dataset_dict = DatasetDict() + for split, dataset in data.items(): + filtered_dataset = dataset.filter( + lambda example: example[key_to_filter] == desired_value + ) + filtered_dataset_dict[split] = filtered_dataset + data = filtered_dataset_dict + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) trainer = transformers.Trainer( model=model, - train_dataset=data['train'], + train_dataset=data["train"], args=transformers.TrainingArguments( per_device_train_batch_size=1, gradient_accumulation_steps=1, @@ -237,26 +252,33 @@ def main(): learning_rate=2e-4, fp16=True if not use_full_precision else False, logging_steps=1, - output_dir=os.path.join(output_dir if len(output_dir) > 0 else "./", "lora_training_logs"), + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False ), - data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) - model.config.use_cache = False # silence the warnings. Please re-enable for inference! - - for batch in trainer.get_train_dataloader(): - print("First batch: ") - print(batch) - break - + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + + # for batch in trainer.get_train_dataloader(): + # print("First batch: ") + # print(batch) + # break + trainer.train() - # if len(output_dir) > 0: - # print(f"Done fine-tuning! Saving the model to {output_dir}...") - # model.save_pretrained(output_dir) - - # if len(publish_peft_with_id) > 0: - # print(f"Done fine-tuning! 
Uploading the model to HF hub with id: {publish_peft_with_id}...") - # model.push_to_hub(publish_peft_with_id, use_auth_token=True) + if len(output_dir) > 0: + print(f"Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print(f"Uploading the model to HF hub with id: {publish_peft_with_id}...") + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 6e143550c8..ad1f903cfb 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -10,20 +10,30 @@ GenerationConfig, ) + def peft_pre_forward_hook(module, input): - assert(module.name is not None and module.decoding_step is not None) + assert module.name is not None and module.decoding_step is not None name = module.name.replace("base_model.model.model.", "") - print(f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) print("Pre-Input: ", input[0].shape) - torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input") - #print("===") + torch.save( + input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) + # print("===") + def peft_post_forward_hook(module, input, output): - assert(module.name is not None and module.decoding_step is not None) + assert module.name is not None and module.decoding_step is not None name = module.name.replace("base_model.model.model.", "") - print(f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) print("Post-Input/Output: ", input[0].shape, output[0].shape) - torch.save(output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") + torch.save( + output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) print("===") module.decoding_step += 1 @@ -36,7 +46,11 @@ def main(): ) parser.add_argument("--max-new-tokens", type=int, default=50) parser.add_argument("--do-sample", action="store_true", help="Use sampling") - parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) args = parser.parse_args() peft_model_id = args.peft_model_id use_full_precision = args.use_full_precision @@ -76,7 +90,17 @@ def main(): generation_config.do_sample = args.do_sample # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_id) - + + print(model) + for name, params in model.named_parameters(): + print(name) + if ( + name + == "base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight" + ): + print(params) + assert False + # Register hooks to save tensors, if needed if save_peft_tensors: shutil.rmtree("./hf_peft_tensors") @@ -86,7 +110,7 @@ def main(): for name, params in model.named_parameters(): if "lora" in name: torch.save(params, f"./hf_peft_tensors/{name}") - #params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") # Save hidden states for name, layer in dict(model.named_modules()).items(): if "lora_A.default" in name or "lora_B.default" in name: @@ -96,7 
+120,6 @@ def main(): layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) - batch = tokenizer("Two things are infinite: ", return_tensors="pt") with torch.cuda.amp.autocast(): output_tokens = model.generate( diff --git a/tests/peft/hf_train.py b/tests/peft/hf_train.py new file mode 100644 index 0000000000..707fc9d0ae --- /dev/null +++ b/tests/peft/hf_train.py @@ -0,0 +1,161 @@ +import os, sys + +# os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn + +# import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument( + "--lora-target-modules", + type=str, + default="down_proj", + help="Comma-separated list of layers from the base model to target", + ) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") + args = parser.parse_args() + model_name = args.model_name + use_full_precision = args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") + lora_dropout = args.lora_dropout + output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError( + "Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the trained model" + ) + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + + for param in model.parameters(): + 
param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + # target_modules=["q_proj", "v_proj"], + # target_modules=["down_proj"], + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data["train"], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False + ), + ) + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + trainer.train() + + if len(output_dir) > 0: + print(f"Done training! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {publish_peft_with_id}..." + ) + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + + +if __name__ == "__main__": + main() From 0431c739970a5ebda5bc592f3b8b62eb5ee141e6 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 20 Nov 2023 20:43:58 +0000 Subject: [PATCH 095/198] loss --- include/flexflow/batch_config.h | 3 ++- include/flexflow/utils/cuda_helper.h | 3 ++- inference/incr_decoding/incr_decoding.cc | 2 +- src/loss_functions/loss_functions.cu | 8 ++++---- src/ops/kernels/softmax.cu | 21 ++++++++++++++++++++- src/runtime/cuda_helper.cu | 10 +++++++++- src/runtime/request_manager.cc | 13 ++++++------- 7 files changed, 44 insertions(+), 16 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 8ddcec7d53..492502ac50 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { static int const MAX_NUM_TOKENS = 1024; // Set by update - int num_tokens = 0, num_peft_tokens = 0; + int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; struct PerRequestInfo { PerRequestInfo() { @@ -89,6 +89,7 @@ class BatchConfig { }; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; + PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 999bc27634..f6a393a9ff 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -75,8 +75,9 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? 
BLOCK_SIZE_LIMIT : ret; } +template __global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); + scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 5375acb355..c76637a62c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -286,7 +286,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; //prompts.push_back(text); - dataset.push_back(std::make_pair(text, "")); + dataset.push_back(std::make_pair(text, text)); } rm->register_new_peft_request( dataset, 256 /*max_sequence_length*/, peft_model_id); diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..636ef9c4c3 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -81,7 +81,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, label_ptr, num_samples, num_classes, k); // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -100,7 +100,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( @@ -119,7 +119,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, @@ -135,7 +135,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - loss_grad_ptr, loss_grad_volume, 0, scale_factor); + loss_grad_ptr, loss_grad_volume, 0.0f, scale_factor); } }; // namespace FlexFlow diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 96d50e1ca4..0fc827319d 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -306,8 +306,18 @@ void peft_bwd_kernel(SoftmaxMeta const *m, } int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch; for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->tokensInfo[j + tokens_previous_requests].token_id; + token_ids[j] = bc->labelsInfo[j + tokens_previous_requests].token_id; } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(cudaMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); checkCUDA(cudaMemcpyAsync(m->handle.workSpace, token_ids, sizeof(BatchConfig::TokenId) * num_bwd_tokens, @@ -323,6 +333,15 @@ void peft_bwd_kernel(SoftmaxMeta const *m, static_cast(m->handle.workSpace), num_bwd_tokens, 
num_classes); + // scale + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); tokens_previous_requests += num_bwd_tokens; } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 0de6d9bc63..935404ad42 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -36,7 +36,8 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } @@ -644,6 +645,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 9e38235bbb..3a520213f5 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -660,8 +660,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(request.status != Request::COMPLETED); assert(request.max_training_steps > 0 && request.completed_training_steps < request.max_training_steps); - int num_peft_tokens = - request.dataset[0].first.size() + request.dataset[0].second.size(); + int num_peft_tokens = request.dataset[0].first.size(); + int num_peft_label_tokens = request.dataset[0].second.size(); if (num_peft_tokens + new_bc.num_active_tokens() <= get_max_tokens_per_batch()) { // The last request slot is reserved for PEFT request @@ -686,13 +686,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_peft_tokens++; } for (size_t i = 0; i < request.dataset[0].second.size(); i++) { - new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.labelsInfo[new_bc.num_peft_label_tokens].token_id = request.dataset[0].second[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = peft_req_idx; int depth = request.dataset[0].first.size() + i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - new_bc.num_tokens++; - new_bc.num_peft_tokens++; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = depth; + new_bc.num_peft_label_tokens++; } } } From 371dffdf06dc0ca62a464f890c0cf80cfc88c33d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 20 Nov 2023 20:45:59 +0000 Subject: [PATCH 096/198] . 
--- inference/incr_decoding/incr_decoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c76637a62c..2313eca385 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -285,7 +285,7 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - //prompts.push_back(text); + // prompts.push_back(text); dataset.push_back(std::make_pair(text, text)); } rm->register_new_peft_request( From da690ff1c0c8ccccd67cad995948bca8ff5667bb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 22:13:47 -0500 Subject: [PATCH 097/198] update generate/request api to take both inference and fine-tuning prompts --- include/flexflow/model.h | 10 +- include/flexflow/request_manager.h | 31 ++--- include/flexflow/utils/cuda_helper.h | 3 +- inference/incr_decoding/incr_decoding.cc | 29 +++-- inference/spec_infer/spec_infer.cc | 11 +- src/c/flexflow_c.cc | 12 +- src/runtime/request_manager.cc | 140 +++++++---------------- 7 files changed, 88 insertions(+), 148 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index b4d2fe53af..7232cb3f0b 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -367,6 +367,8 @@ class AllReduce; class FusedParallelOp; class ParallelOpInfo; +struct Request; + // TODO: Move to an appropriate place /* This is used to create a type that recursively replaces value type @@ -830,13 +832,9 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::string const &prompts, - int max_seq_length, - PEFTModelID peft_model_id = PEFTModelID::NO_ID); + GenerationResult generate(Request const &request); - GenerationResult generate(std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id = PEFTModelID::NO_ID); + GenerationResult generate(std::vector const &request); PEFTModelID register_peft_model( LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0aa654f9e7..8e7a829627 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -59,19 +59,21 @@ struct Request { }; enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; - PEFTModelID peft_model_id; - int max_sequence_length; + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + int max_sequence_length = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; Status status = PENDING; std::vector tokens; + std::string prompt; std::vector beam_trees; // PEFT field RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; + std::vector> dataset_text; std::vector, std::vector>> dataset; @@ -119,26 +121,13 @@ class RequestManager { FFModel *get_model(int model_id); - GenerationResult - generate_incr_decoding(FFModel *model, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id); - GenerationResult generate_spec_infer(FFModel *model, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id); + GenerationResult generate_incr_decoding(FFModel *llm, + std::vector const &requests); + GenerationResult generate_spec_infer(FFModel *llm, + std::vector 
const &requests); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id); - RequestGuid register_new_peft_request( - std::vector> const &dataset, - int max_sequence_length, - PEFTModelID peft_model_id); + RequestGuid register_new_request(Request const &request_); + RequestGuid register_new_peft_request(Request const &request_); bool is_request_completed(RequestGuid const &guid); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f6a393a9ff..caaa54683a 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -76,8 +76,7 @@ inline int GET_BLOCKS(int const N) { } template -__global__ void - scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 2313eca385..01bbdc3d2b 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -279,23 +279,28 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; - std::vector> dataset; + + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // // Add inference request + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, text)); + requests.push_back(fine_tuning_req); total_num_requests++; - // prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); } - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - // for (auto &prompt : prompts) { - // GenerationResult result = model.generate(prompt, 128 - // /*max_sequence_length*/); - // } - GenerationResult result = - model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); + GenerationResult result = model.generate(requests); } // Execution fence diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8b0eb926d9..f6de22a376 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -393,15 +393,18 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); 
total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(requests); } // Execution fence diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 80202f6f99..8f5d197eb3 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1588,10 +1588,16 @@ flexflow_generation_result_t int max_seq_length, int *output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::vector prompts; + std::string const text_str(input_text); - prompts.push_back(input_text); - GenerationResult result = handle->generate(prompts, max_seq_length); + + std::vector requests; + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_length; + requests.push_back(inference_req); + + GenerationResult result = handle->generate(requests); DEBUG_PRINT( "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); assert(result.output_tokens.size() <= max_seq_length); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 3a520213f5..13e829a823 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -176,81 +176,22 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id) { - const std::lock_guard lock(request_queue_mutex); - - // Add a new request - Request request; - request.status = Request::PENDING; - request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - request.peft_model_id = peft_model_id; - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return 0; - } else { - request.initial_len = prompt.size(); - request.tokens = prompt; - } - - if (get_num_ssms() == 0) { - std::cout << "No small speculative model registered, using incremental " - "decoding." 
- << std::endl; - } else { - std::cout << "Num of models: " << get_num_ssms() << std::endl; - for (int i = 0; i < get_num_ssms(); i++) { - BeamTree beam_tree = BeamTree{}; - request.beam_trees.push_back(beam_tree); - } - } - - pending_infr_request_queue.push(request); - all_requests[request.guid] = request; - - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; - for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; - } - } - - GenerationResult gr; - gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; - request_generation_results[request.guid] = gr; - - return request.guid; -} - -RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id) { + RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - request.peft_model_id = peft_model_id; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } - std::vector tokens = this->tokenizer_->Encode(prompt); + std::vector tokens = this->tokenizer_->Encode(request_.prompt); if (tokens.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + << request_.tokens.size() << ".\n"; printf("tokens size: %zu\n", tokens.size()); return 0; @@ -286,29 +227,27 @@ RequestManager::RequestGuid GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; + gr.input_text = request_.prompt; gr.input_tokens = request.tokens; - gr.output_text = prompt; + gr.output_text = request_.prompt; gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; return request.guid; } -RequestManager::RequestGuid RequestManager::register_new_peft_request( - std::vector> const &dataset, - int max_sequence_length, - PEFTModelID peft_model_id) { +RequestManager::RequestGuid + RequestManager::register_new_peft_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - request.peft_model_id = peft_model_id; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; request.max_training_steps = 1; // TODO: let user set this - for (auto const &sample : dataset) { + for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { @@ -321,6 +260,7 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( std::cout << "Warning: too many tokens in sample, only load up to " << get_max_sequence_length() << " tokens, but got " << input_tokens.size() + output_tokens.size() << ".\n"; + return 0; } else { request.dataset.push_back(std::make_pair(input_tokens, 
output_tokens)); } @@ -688,9 +628,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, for (size_t i = 0; i < request.dataset[0].second.size(); i++) { new_bc.labelsInfo[new_bc.num_peft_label_tokens].token_id = request.dataset[0].second[i]; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = peft_req_idx; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = + peft_req_idx; int depth = request.dataset[0].first.size() + i; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = depth; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = + depth; new_bc.num_peft_label_tokens++; } } @@ -2086,26 +2028,20 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::string const &prompt, - int max_seq_length, - PEFTModelID peft_model_id) { - std::vector prompts; - prompts.push_back(prompt); - return generate(prompts, max_seq_length, peft_model_id); +GenerationResult FFModel::generate(Request const &request) { + std::vector requests; + requests.push_back(request); + return generate(requests); } -GenerationResult FFModel::generate(std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { +GenerationResult FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding( - this, prompts, max_seq_length, peft_model_id); + return rm->generate_incr_decoding(this, requests); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer( - this, prompts, max_seq_length, peft_model_id); + return rm->generate_spec_infer(this, requests); } } @@ -2213,14 +2149,15 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, /*static*/ GenerationResult RequestManager::generate_incr_decoding( - FFModel *llm, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { + FFModel *llm, std::vector const &requests) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); + for (int i = 0; i < requests.size(); i++) { + if (requests.at(i).req_type == Request::REQ_INFERENCE) { + guid = register_new_request(requests.at(i)); + } else { + guid = register_new_peft_request(requests.at(i)); + } } if (guid == 0) { @@ -2230,7 +2167,8 @@ GenerationResult RequestManager::generate_incr_decoding( return GenerationResult(); } - int tokens_to_generate = max_seq_length - all_requests[guid].tokens.size(); + int tokens_to_generate = + all_requests[guid].max_sequence_length - all_requests[guid].tokens.size(); std::queue> batch_pipeline; { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } @@ -2275,13 +2213,15 @@ GenerationResult RequestManager::generate_incr_decoding( /*static*/ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { + std::vector const &requests) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); + for (int i = 0; i < requests.size(); i++) { + if (requests.at(i).req_type == Request::REQ_INFERENCE) { + guid = register_new_request(requests.at(i)); + } else { + guid = 
register_new_peft_request(requests.at(i)); + } } if (guid == 0) { std::cout From 1e5bb7202228831f469f06f469a02ab0439bfc84 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 22:14:01 -0500 Subject: [PATCH 098/198] linting --- src/ops/inc_multihead_self_attention.cpp | 28 ++++---- src/ops/inc_multihead_self_attention.cu | 66 +++++++++---------- src/ops/kernels/linear_kernels.cpp | 42 ++++++------ src/ops/kernels/linear_kernels.cu | 44 ++++++------- src/ops/kernels/lora_linear_kernels.cu | 44 ++++++------- src/ops/spec_inc_multihead_self_attention.cpp | 14 ++-- src/ops/spec_inc_multihead_self_attention.cu | 22 +++---- src/ops/tree_inc_multihead_self_attention.cpp | 14 ++-- src/ops/tree_inc_multihead_self_attention.cu | 22 +++---- 9 files changed, 148 insertions(+), 148 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 188659bea0..d38f93558e 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -258,13 +258,13 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -512,13 +512,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e597c7de97..54713769a0 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -239,17 +239,17 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudaDataType_t compute_type = cublas_data_type; -// #if 
defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -468,17 +468,17 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -886,17 +886,17 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = 
bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 4fa8ab244f..40533805d3 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -275,13 +275,13 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); hipblasDatatype_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = output_type; -// #else -// // TODO: currently use the output_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = output_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = output_type; + // #else + // // TODO: currently use the output_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = output_type; + // #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -372,13 +372,13 @@ void peft_bwd_kernel(LinearMeta const *m, // update input_grad_ptr offset input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = output_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = output_type; + // #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { relu_backward_kernel(m->output_type[0], @@ -443,13 +443,13 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); hipblasDatatype_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = output_type; -// #else -// // TODO: currently use output_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = output_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = output_type; + // #else + // // TODO: currently use output_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = output_type; + // #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 248e59bdeb..b41f5b3213 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -366,17 +366,17 @@ void forward_kernel(LinearMeta const *m, cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -540,17 +540,17 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the 
default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 85a5d9990f..7be949a0d3 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -146,17 +146,17 @@ void inference_kernel(LoraLinearMeta *m, assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = output_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->input_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -269,17 +269,17 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = output_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp 
b/src/ops/spec_inc_multihead_self_attention.cpp index d827a79c22..aebd5e8892 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -201,13 +201,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 999492f7c3..10c544f2a9 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -216,17 +216,17 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index d385880a74..03e0ac6441 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -158,13 +158,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// 
hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index fc3d1fda72..6b38f99b87 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -159,17 +159,17 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = From f3ff40b49abdbc802810224dc57377fd6d5c06be Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 22:40:08 -0500 Subject: [PATCH 099/198] alignment fixes in lora & linear layer --- include/flexflow/operator.h | 41 +++++++++++++++--------- inference/incr_decoding/incr_decoding.cc | 2 +- src/ops/kernels/linear_kernels.cu | 5 +-- src/ops/kernels/lora_linear_kernels.cu | 12 +++---- src/ops/linear.cc | 6 ++++ src/ops/sigmoid_silu_multi.cu | 7 ++-- 6 files changed, 46 insertions(+), 27 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index af39412232..e3f28756ec 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -243,6 +243,20 @@ class Op { } virtual void print_layer(FFModel const &model) = 0; template + static std::string get_op_name_without_uid(OpMetaType *m) { + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + return op_name_without_uid; + } + template static void save_inference_tensors_to_file( OpMetaType *m, int shard_id, @@ -250,7 +264,8 @@ class Op { std::vector input_tensors, std::vector weight_tensors, std::vector output_tensors, - bool fwd_pass = true) { + bool fwd_pass = true, + bool before_kernel = 
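For reference, a standalone sketch of the uid-stripping behavior of get_op_name_without_uid introduced above: it walks backwards over the trailing run of digits/underscores and erases from the last underscore it saw, so "<layer-name>_<uid>" becomes "<layer-name>". The op name in the check below is made up for illustration.

#include <cassert>
#include <cctype>
#include <string>
static std::string strip_uid(std::string name) {
  size_t last_underscore = name.length() - 1;
  for (int i = (int)name.length() - 1; i > 0; i--) {
    if (!(std::isdigit((unsigned char)name[i]) || name[i] == '_')) {
      break;
    } else if (name[i] == '_') {
      last_underscore = i;
    }
  }
  name.erase(last_underscore); // drop "_<uid>" (and any trailing digit/underscore run)
  return name;
}
int main() {
  assert(strip_uid("layers_11_attn_qkv_proj_2060") == "layers_11_attn_qkv_proj");
  return 0;
}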
false) { // Check if output directory exists, and create it if it does not char const *folder_path = "./inference_tensors"; struct stat st = {0}; @@ -259,16 +274,7 @@ class Op { mkdir(folder_path, 0700); } // output base filepath, shared by all tensors from the same operator - std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; - for (int i = op_name_without_uid.length() - 1; i > 0; i--) { - if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { - break; - } else if (m->op_name[i] == '_') { - last_underscore = i; - } - } - op_name_without_uid.erase(last_underscore); + std::string op_name_without_uid = get_op_name_without_uid(m); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + (fwd_pass ? "_decoding-step_" : "_bwd-step_") + @@ -277,6 +283,9 @@ class Op { "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); + if (before_kernel) { + base_filepath += "_pre"; + } // save batch config, if passed if (bc != nullptr) { bc->save_to_file(base_filepath + "_batch-config"); @@ -353,10 +362,12 @@ class Op { } } // increase count of decoding steps - if (fwd_pass) { - m->decoding_step++; - } else { - m->bwd_step++; + if (!before_kernel) { + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } } virtual bool measure_operator_cost(Simulator *sim, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 01bbdc3d2b..cf92e6834a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -296,7 +296,7 @@ void FlexFlow::top_level_task(Task const *task, fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; fine_tuning_req.max_sequence_length = 128; fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, text)); + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); requests.push_back(fine_tuning_req); total_num_requests++; } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index b41f5b3213..8cf5db3f11 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -493,8 +493,9 @@ void peft_bwd_kernel(LinearMeta const *m, } // Compute data gradient - // NOTE: we use alpha=1 for input_grad to accumulate gradients - DT alpha = 1.0f, beta = 0.0f; + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 
0.0f : 1.0f; if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 7be949a0d3..8fb502bf10 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -262,7 +262,6 @@ void peft_bwd_kernel(LoraLinearMeta *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == output_type); @@ -300,7 +299,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->model_weights[bc->requestsInfo[i].peft_model_id]; int rank = weight.rank; // Compute w1's gradient - // NOTE: we use alpha=1 for w1_grad to accumulate gradients + DT alpha = 1.0f, beta = 0.0f; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -314,7 +313,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, output_grad_ptr, output_type, out_dim, - &alpha, + &beta, weight.w1_grad_ptr, weight_type, rank, @@ -322,7 +321,6 @@ void peft_bwd_kernel(LoraLinearMeta *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Compute gradients w.r.t. low_rank activation // and save the results to low_rank_activation - // NOTE: we use alpha=1 for input_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, @@ -336,14 +334,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, output_grad_ptr, output_type, out_dim, - &alpha, + &beta, m->low_rank_activation, lr_actv_type, rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Compute w0's gradient - // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -364,8 +361,9 @@ void peft_bwd_kernel(LoraLinearMeta *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Compute input gradient - // NOTE: we use alpha=1 for input_grad to accumulate gradients + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed if (input_grad_ptr != nullptr) { + beta = m->reset_input_grads[0] ? 
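The functional change in these backward GEMMs is the beta operand: cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C, so beta = 0 overwrites the destination gradient buffer while beta = 1 accumulates into it, which is exactly what reset_input_grads selects when a tensor receives gradients from more than one consumer. A minimal per-element illustration (values arbitrary):

#include <cassert>
int main() {
  float const existing_grad = 2.0f, new_grad = 3.0f, alpha = 1.0f;
  bool const reset_flags[2] = {true, false};
  for (bool reset : reset_flags) {
    float beta = reset ? 0.0f : 1.0f;
    float c = alpha * new_grad + beta * existing_grad; // what each GEMM element does
    assert(c == (reset ? 3.0f : 5.0f));                // overwrite vs. accumulate
  }
  return 0;
}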
0.0f : 1.0f; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0887b6d35b..fa74e22fc6 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -757,6 +757,12 @@ void Linear::peft_bwd_task(Task const *task, int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false, true); + } peft_bwd_kernel_wrapper(m, input_grad.ptr, output_grad.ptr, diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index bb78973f70..60eb699496 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -57,9 +57,12 @@ __global__ void SigmoidSiluMultiBackwardKernel(int num_elements, sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; - input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); + // input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * + // T(sigmoid_val); + input2_grad_ptr[i] = output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); - input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + // input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); T sig_grad = ss_grad_val * input1_ptr[i]; float x1_grad_val = static_cast(sig_grad); From 7efd3a7ce7b708154a739f6648293609e1049c21 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 21 Nov 2023 00:46:43 -0500 Subject: [PATCH 100/198] alignment fix --- src/ops/kernels/lora_linear_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 8fb502bf10..9cd5d2ecfa 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -292,7 +292,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -354,7 +354,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - &alpha, + &beta, weight.w0_grad_ptr, weight_type, in_dim, @@ -377,7 +377,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - &alpha, + &beta, input_grad_ptr, input_type, in_dim, From b6fe334c4364851b4dbf89c981973e454d802d88 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 22 Nov 2023 05:20:08 +0000 Subject: [PATCH 101/198] diagonal --- src/ops/inc_multihead_self_attention.cu | 48 ++++++++++++++++--------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 54713769a0..28b94fe805 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -223,6 +223,23 @@ __global__ void } } +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t 
head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -658,6 +675,20 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, m->qk_prods)); // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } } // Step 5: compute gradients w.r.t. key { @@ -855,23 +886,6 @@ __global__ void store_query_cache(DT const *devQKVProjArray, } } -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - template void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, From bcf8b1930f165901855b345737b54bb3b9da83f3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 22 Nov 2023 16:44:18 -0500 Subject: [PATCH 102/198] fix --- inference/incr_decoding/incr_decoding.cc | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index cf92e6834a..f1a51aa670 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -284,21 +284,21 @@ void FlexFlow::top_level_task(Task const *task, for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = peft_model_id; + requests.push_back(inference_req); total_num_requests++; + // Add fine-tuning request + // Request fine_tuning_req; + // fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + // fine_tuning_req.max_sequence_length = 128; + // fine_tuning_req.peft_model_id = peft_model_id; + // fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + // requests.push_back(fine_tuning_req); + // total_num_requests++; } GenerationResult result = model.generate(requests); } From 4bfee967f4c19b3427c2db5928baa66570dcca75 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 22 Nov 2023 17:20:45 -0500 Subject: [PATCH 103/198] alignment fix ssm --- src/ops/sigmoid_silu_multi.cu | 41 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 60eb699496..21940fd7d0 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -51,23 +51,30 @@ __global__ void SigmoidSiluMultiBackwardKernel(int num_elements, T const *input1_ptr, T const *input2_ptr, T *input1_grad_ptr, - T *input2_grad_ptr) { + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { 
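The index math in fill_entries_above_diagonal inverts the triangular-number formula: a flat index e over the masked entries maps to y = floor((-1 + sqrt(8e + 1)) / 2) and x = e - y(y + 1) / 2, after which y is shifted past the diagonal so each (x, y) pair lands strictly on one side of it and the corresponding score can be overwritten with the supplied value. A small host-side check of that mapping (matrix size chosen arbitrarily):

#include <cassert>
#include <cmath>
#include <cstddef>
int main() {
  size_t const num_rows = 8, num_cols = 8;
  size_t const entries_above_diagonal = num_rows * (num_rows - 1) / 2;
  for (size_t e = 0; e < entries_above_diagonal; e++) {
    size_t y = (size_t)((-1 + std::sqrt(8 * (double)e + 1)) / 2);
    size_t x = e - y * (y + 1) / 2;
    y += (num_cols - num_rows) + 1;                     // same shift as the kernel
    assert(x < y && y < num_rows && x < num_cols);      // strictly off-diagonal, in bounds
  }
  return 0;
}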
CUDA_KERNEL_LOOP(i, num_elements) { float sigmoid_val = static_cast(input1_ptr[i]); sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; - // input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * - // T(sigmoid_val); - input2_grad_ptr[i] = output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); - - // input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); - input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + } T sig_grad = ss_grad_val * input1_ptr[i]; float x1_grad_val = static_cast(sig_grad); - x1_grad_val = exp(-x1_grad_val) / - ((1.0f + exp(-sigmoid_val)) * (1.0f + exp(-sigmoid_val))); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); input1_grad_ptr[i] += T(x1_grad_val); } } @@ -226,7 +233,9 @@ void SigmoidSiluMulti::backward_kernel_wrapper( input1.get_float_ptr(), input2.get_float_ptr(), input1_grad.get_float_ptr(), - input1_grad.get_float_ptr()); + input1_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { SigmoidSiluMultiBackwardKernel<<reset_input_grads[0], + m->reset_input_grads[1]); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } @@ -307,7 +318,9 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( static_cast(m->input_activation) + num_peft_tokens * in_dim, input1_grad.get_float_ptr(), - input1_grad.get_float_ptr()); + input1_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { SigmoidSiluMultiBackwardKernel<<(m->input_activation) + num_peft_tokens * in_dim, input1_grad.get_half_ptr(), - input2_grad.get_half_ptr()); + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } From efd19769d7d6734aadfb2ba2ddd15caed01a008e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 24 Nov 2023 09:44:51 -0500 Subject: [PATCH 104/198] sigmoid-silu-multi now fully aligned --- src/ops/sigmoid_silu_multi.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 21940fd7d0..ec88042a1d 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -272,9 +272,8 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - int num_elements = output_grad.domain.get_volume(); - assert(input1_grad.domain.get_volume() == num_elements); - assert(input2_grad.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); cudaEvent_t t_start, t_end; if (m->profiling) { @@ -306,19 +305,20 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( assert(num_peft_tokens >= 1); } int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiBackwardKernel<<>>( - output_grad.domain.get_volume(), + num_elements, output_grad.get_float_ptr(), 
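These formulas correspond to a forward of output = input2 * input1 * sigmoid(input1), i.e. input2 * SiLU(input1), and use sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)). A quick finite-difference check of the two analytic gradients (sample points arbitrary):

#include <cassert>
#include <cmath>
int main() {
  auto sig = [](double z) { return 1.0 / (1.0 + std::exp(-z)); };
  auto f = [&](double a, double b) { return b * a * sig(a); }; // b * silu(a)
  double const x1 = 0.7, x2 = -1.3, h = 1e-6;
  // analytic gradients, in the same form as the kernel above
  double const d1 = x2 * (sig(x1) + x1 * sig(x1) * (1.0 - sig(x1)));
  double const d2 = x1 * sig(x1);
  assert(std::fabs(d1 - (f(x1 + h, x2) - f(x1 - h, x2)) / (2 * h)) < 1e-6);
  assert(std::fabs(d2 - (f(x1, x2 + h) - f(x1, x2 - h)) / (2 * h)) < 1e-6);
  return 0;
}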
static_cast(m->input_activation), static_cast(m->input_activation) + num_peft_tokens * in_dim, input1_grad.get_float_ptr(), - input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), m->reset_input_grads[0], m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { @@ -326,7 +326,7 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( min(CUDA_NUM_THREADS, num_elements), 0, stream>>>( - output_grad.domain.get_volume(), + num_elements, output_grad.get_half_ptr(), static_cast(m->input_activation), static_cast(m->input_activation) + From 7ae195ac12baf62c82ce81d7872a485c4f867618 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 24 Nov 2023 11:49:44 -0500 Subject: [PATCH 105/198] rms norm kernel updates --- .../ops/kernels/residual_rms_norm_kernels.h | 3 - .../flexflow/ops/kernels/rms_norm_kernels.h | 4 - src/ops/kernels/residual_rms_norm_kernels.cu | 92 +++++----- src/ops/kernels/rms_norm_kernels.cu | 161 ++++++++---------- 4 files changed, 108 insertions(+), 152 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 3091f83675..691f8ef8c1 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -32,9 +32,6 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - int in_dim; int batch_size; int num_elements; diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 92e5e04af3..46297764ec 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -31,10 +31,6 @@ class RMSNormMeta : public OpMeta { float eps; void *rms_ptr; void *norm_ptr; - void *c2_ptr; - - float alpha; - float beta; int in_dim; int batch_size; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 42a8747cbf..9ffbf1b3ba 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -96,25 +92,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? 
shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, float eps, @@ -359,7 +336,9 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, T const *c1, T const *c2, T *dX1, - T *dX2) { + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; @@ -367,10 +346,16 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, static_cast(c1[i]) * static_cast(dY[index]) * static_cast(gamma[j]) + static_cast(c2[i]) * static_cast(X[index]); - // dX1[index] += dX_val; - // dX2[index] += dX_val; - dX1[index] = static_cast(dX_val); - dX2[index] = static_cast(dX_val); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] += dX_val; + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX_val); + } else { + dX2[index] += dX_val; + } } } @@ -399,10 +384,10 @@ void backward_kernel(ResidualRMSNormMeta const *m, T const *weight_ptr, T *weight_grad_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; + int M = m->batch_size; + int N = m->in_dim; ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, residual_output_rms_input_ptr, @@ -410,23 +395,25 @@ void backward_kernel(ResidualRMSNormMeta const *m, static_cast(m->rms_ptr), static_cast(m->norm_ptr)); - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr); - const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - GammaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - residual_output_rms_input_ptr, - static_cast(m->rms_ptr), - weight_grad_ptr); + RMSNormBackwardCUDAKernel<<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); } template @@ -450,8 +437,7 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, continue; } - int M = m->batch_size; // TODO: replace with - // m->requestsInfo[i].num_tokens_in_batch; + int M = bc->requestsInfo[i].num_tokens_in_batch; int N = m->in_dim; T const *residual_output_rms_input_ptr = @@ -468,14 +454,16 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel <<>>( - m->in_dim, + N, output_grad_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), static_cast(m->norm_ptr), residual_input0_grad_ptr, - residual_input1_grad_ptr); + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); } } diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index ae6a5d590d..d0702d651e 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = 
rms->data_dim; batch_size = rms->effective_batch_size; @@ -41,15 +37,11 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, DataType data_type = rms->weights[0]->data_type; size_t rms_ptr_size = batch_size; - size_t c2_ptr_size = rms_ptr_size; size_t norm_ptr_size = num_elements; - size_t totalSize = - (rms_ptr_size + c2_ptr_size + norm_ptr_size) * data_type_size(data_type); + size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); - c2_ptr = gpu_mem_allocator.allocate_instance_untyped( - c2_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); } @@ -100,25 +92,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -130,16 +103,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -147,10 +115,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -162,24 +129,15 @@ void forward_kernel(RMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - RMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, @@ -326,14 +284,20 @@ __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T 
*c2) { __shared__ T ds_storage[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T ds = 0; + float ds = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int const index = i * N + j; - ds += dY[index] * X[index] * gamma[j]; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); } ds = BlockReduceSum(ds, ds_storage); if (threadIdx.x == 0) { - c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); } } @@ -344,11 +308,20 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, T const *gamma, T const *c1, T const *c2, - T *dX) { + T *dX, + bool reset_input_grad) { const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - dX[index] = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } } } @@ -376,33 +349,33 @@ void backward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *weight_grad_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; + int M = m->batch_size; + int N = m->in_dim; ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, input_ptr, weight_ptr, static_cast(m->rms_ptr), - static_cast(m->c2_ptr)); - - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr), - input_grad_ptr); - const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - GammaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->rms_ptr), - weight_grad_ptr); + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); } void backward_kernel_wrapper(RMSNormMeta const *m, @@ -475,24 +448,26 @@ void peft_bwd_kernel(RMSNormMeta const *m, continue; } - const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; - const int64_t N = m->num_elements; + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, static_cast(m->input_activation), weight_ptr, static_cast(m->rms_ptr), - static_cast(m->c2_ptr)); - RMSNormBackwardCUDAKernel<<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr), - input_grad_ptr); + static_cast(m->norm_ptr)); + RMSNormBackwardCUDAKernel + <<>>( + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); } } From 703081444ed26c3132bd20fb375a07973019198d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 Nov 2023 11:02:08 -0500 Subject: [PATCH 106/198] fix --- src/ops/kernels/residual_rms_norm_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 
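For reference, the c1/c2 form used by ComputeInternalGradientsCUDAKernel and RMSNormBackwardCUDAKernel is the gradient of y_j = gamma_j * x_j / sqrt(mean(x^2) + eps): dX_j = r * dY_j * gamma_j + c2 * x_j with r = rrms and c2 = -(sum_i dY_i * x_i * gamma_i) * r^3 / N. A per-row CPU sketch (the reset/accumulate branches and the gamma gradient are omitted):

#include <cmath>
#include <vector>
void rms_norm_backward_row(std::vector<float> const &dY,
                           std::vector<float> const &X,
                           std::vector<float> const &gamma,
                           float eps,
                           std::vector<float> &dX) {
  int const N = (int)X.size();
  float ms = 0.0f, ds = 0.0f;
  for (int j = 0; j < N; j++) {
    ms += X[j] * X[j]; // sum of squares
  }
  float const r = 1.0f / std::sqrt(ms / N + eps); // rrms, i.e. c1
  for (int j = 0; j < N; j++) {
    ds += dY[j] * X[j] * gamma[j];
  }
  float const c2 = -ds * r * r * r / N;
  for (int j = 0; j < N; j++) {
    dX[j] = r * dY[j] * gamma[j] + c2 * X[j]; // same form as the CUDA kernel
  }
}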
9ffbf1b3ba..b12d105c1b 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -349,12 +349,12 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, if (reset_input_grad1) { dX1[index] = static_cast(dX_val); } else { - dX1[index] += dX_val; + dX1[index] += static_cast(dX_val); } if (reset_input_grad2) { - dX2[index] = static_cast(dX_val); + dX2[index] = static_cast(dX1[index]); } else { - dX2[index] += dX_val; + dX2[index] += static_cast(dX1[index]); } } } From eb3b6abd500931fe7027e62e3d9c618f907a4f25 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 Nov 2023 11:03:12 -0500 Subject: [PATCH 107/198] in-place residual rms --- include/flexflow/ops/residual_rms_norm.h | 1 + src/ops/residual_rms_norm.cc | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index de6e6ea506..2acc06841c 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -32,6 +32,7 @@ class ResidualRMSNorm : public Op { ResidualRMSNorm const &other, Input const &inputs, bool allocate_weights); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index a57b9248c7..953dd60242 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -234,6 +234,18 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, } } +void ResidualRMSNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + void ResidualRMSNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; From a122e306351ed585b3e585e44f05a85419372269 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 27 Nov 2023 23:32:09 -0500 Subject: [PATCH 108/198] bug fix and linting --- include/flexflow/batch_config.h | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ed0104e05d..cc32afca84 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -65,7 +65,7 @@ class BatchConfig { int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; // number of tokens in prompt phase, start offset of tokens in inc_decoding // phase. 
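// Note on the map_output_tensors override above: outputs[0] (the residual sum)
// reuses inputs[0]'s region, partition, and gradient region, so that output is
// produced in place; only outputs[1], the normalized result, gets a freshly
// mapped region via ff.map_tensor.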
num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens=0; + int num_generation_tokens = 0; struct PerRequestInfo { PerRequestInfo() { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cd784c1a3c..ece7d47b58 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1285,7 +1285,7 @@ __global__ void store_query_cache(DT const *devQKVProjArray, } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *bias_ptr, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 3ff0f5c80e..336fcb5c99 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -493,7 +493,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - + compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } From 53e737b912c4c8368ae2aa645b4c9b19930159c3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 16:19:18 -0500 Subject: [PATCH 109/198] align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj with huggingface --- src/ops/inc_multihead_self_attention.cu | 83 +++++++++++++++++-------- 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ece7d47b58..f5288964e9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -894,6 +894,26 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; // } // #endif + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_bwd-step_" + std::to_string(m->bwd_step) + + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + + "_layer-name_" + op_name_without_uid + "_shard-id_" + + std::to_string(shard_id); + + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -913,30 +933,31 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int vt_block_size = m->vProjSize; int vt_req_block_size = vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); // Step 1: compute gradients before final projection { int m_ = m->vProjSize * m->num_q_heads; int n_ = num_tokens; int k_ = m->oProjSize; - int lda = k_; + int lda = m_; int ldb = k_; int ldc = m_; float alpha = 1.0f, beta = 0.0f; // matrix A: output projection weight - // matrix A's layout: [num_heads, vProjSize, oProjSize] + // matrix A's layout: [vProjSize * num_heads, oProjSize] DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_q_heads + 
m->vProjSize * m->num_q_heads); // matrix B: output gradients - // matrix B's layout: [num_new_tokens, oProjSize] + // matrix B's layout: [oProjSize, num_new_tokens] DT const *B = output_grad_ptr + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; // matrix C: attn_heads gradients - // matrix C's layout: [num_new_tokens, num_heads, vProjSize] + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] DT *C = static_cast
(m->handle.workSpace); checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, @@ -954,33 +975,38 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_o_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_, filename.c_str()); } // Step 2: compute gradients w.r.t. value { float alpha = 1.0f, beta = 0.0f; - // matrix A: attn_heads gradients - // matrix A's layout: [num_tokens, num_heads, vProjSize] - DT const *A = static_cast
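All of the layout comments in these backward steps follow the cuBLAS column-major convention: element (r, c) of a matrix with leading dimension ld lives at r + c * ld, and CUBLAS_OP_T transposes the operand. A compact CPU model of cublasGemmEx that the m_/n_/k_/ld choices above can be sanity-checked against (a sketch, not the library call itself):

#include <vector>
enum class Op { N, T };
// op(X)(r, c) where X is stored column-major with leading dimension ld.
static float at(std::vector<float> const &X, int ld, int r, int c, Op op) {
  return (op == Op::N) ? X[r + c * ld] : X[c + r * ld];
}
// C (m x n) = alpha * op(A) (m x k) * op(B) (k x n) + beta * C, all column-major.
void gemm_col_major(Op opA, Op opB, int m, int n, int k, float alpha,
                    std::vector<float> const &A, int lda,
                    std::vector<float> const &B, int ldb, float beta,
                    std::vector<float> &C, int ldc) {
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < m; i++) {
      float acc = 0.0f;
      for (int p = 0; p < k; p++) {
        acc += at(A, lda, i, p, opA) * at(B, ldb, p, j, opB);
      }
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
  }
}
int main() {
  // 2x2 sanity check: C = A^T * I should equal A^T.
  std::vector<float> A = {1.0f, 3.0f, 2.0f, 4.0f}; // columns (1,3) and (2,4)
  std::vector<float> B = {1.0f, 0.0f, 0.0f, 1.0f}; // identity
  std::vector<float> C(4, 0.0f);
  gemm_col_major(Op::T, Op::N, 2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
  std::vector<float> expected = {1.0f, 2.0f, 3.0f, 4.0f}; // A^T in column-major order
  return C == expected ? 0 : 1;
}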
(m->handle.workSpace); - // matrix B: qk_prods_softmax - // matrix B's layout: [num_heads, num_tokens, num_tokens] - DT const *B = static_cast
(m->qk_prods_softmax); + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->handle.workSpace); // matrix C: gradients for value (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + - // vProjSize] - DT *C = - static_cast
(m->devQKVProjArray) + m->qProjSize + m->kProjSize; - int m_ = m->vProjSize; - int n_ = num_tokens; - int k_ = num_tokens; - int lda = m->vProjSize * m->num_q_heads; - int ldb = num_tokens; - int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int strideA = m->vProjSize; - int strideB = num_tokens * num_tokens; - int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray) + 2*(m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, - CUBLAS_OP_N, + CUBLAS_OP_T, m_, n_, k_, @@ -1001,6 +1027,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_v_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); + std::string filename2 = base_filepath + "_qk_prods_softmax"; + std::cout << "FILENAME: " << filename2 << std::endl; + save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { From edc02af728380c4849d99fee0277e21c97c4358e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 16:26:03 -0500 Subject: [PATCH 110/198] cleanup --- src/ops/inc_multihead_self_attention.cu | 31 ------------------------- 1 file changed, 31 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f5288964e9..f54cd58408 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -894,26 +894,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; // } // #endif - std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; - for (int i = op_name_without_uid.length() - 1; i > 0; i--) { - if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { - break; - } else if (m->op_name[i] == '_') { - last_underscore = i; - } - } - op_name_without_uid.erase(last_underscore); - - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_bwd-step_" + std::to_string(m->bwd_step) + - "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + - "_layer-name_" + op_name_without_uid + "_shard-id_" + - std::to_string(shard_id); - - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -975,10 +955,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - std::string filename = base_filepath + "_o_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_, filename.c_str()); } // Step 2: compute gradients w.r.t. 
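// Note on the strided-batched calls in these attention steps:
// cublasGemmStridedBatchedEx runs one independent GEMM per attention head,
// with head h reading/writing A + h * strideA, B + h * strideB, C + h * strideC
// under the same m_/n_/k_ and leading dimensions (batch count = m->num_q_heads).
// That is why strideA above is num_new_tokens * total_tokens (one whole per-head
// score matrix), while strideB is vProjSize, stepping across the head dimension
// of the packed gradient buffer whose leading dimension is vProjSize * num_q_heads.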
value { @@ -1027,13 +1003,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - std::string filename = base_filepath + "_v_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); - std::string filename2 = base_filepath + "_qk_prods_softmax"; - std::cout << "FILENAME: " << filename2 << std::endl; - save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { From f00c7e0b90ce582e260596b6048577cb993bcae3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 17:07:57 -0500 Subject: [PATCH 111/198] finished all alignment fixes in attention backward kernel --- src/ops/inc_multihead_self_attention.cu | 122 ++++++++++++------------ 1 file changed, 63 insertions(+), 59 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f54cd58408..b5ed032137 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1007,24 +1007,27 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { float alpha = 1.0f, beta = 0.0f; - int m_ = num_tokens; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
<DT const *>(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT const *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens int n_ = num_tokens; int k_ = m->vProjSize; + // before transposition and striding int lda = m->vProjSize * m->num_q_heads; int ldb = m->vProjSize * m->num_q_heads; - int ldc = num_tokens; + int ldc = num_tokens; // num_new_tokens int strideA = m->vProjSize; int strideB = m->vProjSize; - int strideC = num_tokens * num_tokens; - // matrix A: value cache - // matrix A's layout: [num_req, max_num_tokens, num_heads, vProjSize] - DT const *A = static_cast
<DT const *>(m->valueCache) + i * vt_req_block_size; - // matrix B: attn_heads gradients - // matrix B's layout: [num_new_tokens, num_heads, vProjSize] - DT const *B = static_cast<DT const *>
(m->handle.workSpace); - // matrix C: qk_prods_softmax gradients - // matrix C's layout: [num_heads, num_total_tokens, num_new_tokens] - DT *C = static_cast<DT *>
(m->qk_prods_softmax); + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -1096,27 +1099,28 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->qk_prod_scaling) { alpha = 1.0f / sqrt(m->kProjSize); } - // matrix A: query activation (in query_activation_buffer) - // matrix A's layout: [num_tokens, num_heads, m->qProjSize] - DT const *A = static_cast
<DT const *>(m->query_activation_buffer); - // matrix B: gradients w.r.t. qk_prods - // matrix B's layout: [num_heads, num_tokens, num_tokens] - DT const *B = static_cast<DT const *>
(m->qk_prods); - // matrix C: gradients w.r.t. key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + - // vProjSize] - DT *C = static_cast<DT *>
(m->devQKVProjArray) + m->qProjSize; - int m_ = m->kProjSize; - int n_ = num_tokens; - int k_ = num_tokens; - int lda = m->num_q_heads * m->qProjSize; - int ldb = num_tokens; - int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int strideA = m->qProjSize; - int strideB = num_tokens * num_tokens; - int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT const *>(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT const *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, + CUBLAS_OP_T, CUBLAS_OP_T, m_, n_, @@ -1145,27 +1149,29 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->qk_prod_scaling) { alpha = 1.0f / sqrt(m->kProjSize); } - // matrix A: key cache - // matrix A's layout: [num_tokens, num_heads, m->kProjSize] - DT const *A = static_cast
<DT const *>(m->keyCache) + i * kt_req_block_size; - // matrix B: gradients w.r.t. qk_prods - // matrix B's layout: [num_heads, num_tokens, num_tokens] - DT const *B = static_cast<DT const *>
(m->qk_prods); - // matrix C: gradients w.r.t. query (saved as part of m->devQKVProjArray) - // matrix C's layout: - // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] - DT *C = static_cast<DT *>
(m->devQKVProjArray); - int m_ = m->qProjSize; - int n_ = num_tokens; - int k_ = num_tokens; - int lda = m->kProjSize * m->num_q_heads; - int ldb = num_tokens; - int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int strideA = m->kProjSize; - int strideB = num_tokens * num_tokens; - int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT const *>(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT const *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + // after transposition & striding + // after transposition & striding + int m_ = num_tokens; + int n_ = m->qProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, + CUBLAS_OP_T, CUBLAS_OP_T, m_, n_, @@ -1195,26 +1201,24 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, beta = 1.0f; } // matrix A: QKV projection weights - // matrix A's layout: - // [(qProjSize + kProjSize + vProjSize) * num_q_heads, qSize] + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] DT const *A = weight_ptr; // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: - // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] DT const *B = static_cast
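In all of the backward hunks above, m_, n_ and k_ describe the operand shapes after the requested transpositions, while lda/ldb/ldc and the strides describe how A, B and C actually sit in memory before them (cuBLAS is column-major). A host-side reference of what each cublasGemmStridedBatchedEx call computes makes that bookkeeping easier to check; the routine below is an illustrative sketch only, not part of FlexFlow:

  template <typename DT>
  void reference_gemm_strided_batched(bool transA, bool transB, int m, int n,
                                      int k, float alpha, DT const *A, int lda,
                                      size_t strideA, DT const *B, int ldb,
                                      size_t strideB, float beta, DT *C, int ldc,
                                      size_t strideC, int batch) {
    // Column-major: C_b = alpha * op(A_b) * op(B_b) + beta * C_b for each batch
    // b, where A_b = A + b * strideA (same for B and C); op(A_b) is m x k,
    // op(B_b) is k x n, and lda/ldb/ldc are the leading dimensions of the
    // untransposed buffers, exactly as passed to cuBLAS.
    for (int b = 0; b < batch; b++) {
      DT const *Ab = A + b * strideA;
      DT const *Bb = B + b * strideB;
      DT *Cb = C + b * strideC;
      for (int col = 0; col < n; col++) {
        for (int row = 0; row < m; row++) {
          float acc = 0.0f;
          for (int t = 0; t < k; t++) {
            float a = transA ? static_cast<float>(Ab[row * lda + t])   // A^T
                             : static_cast<float>(Ab[t * lda + row]);  // A
            float bv = transB ? static_cast<float>(Bb[t * ldb + col])  // B^T
                              : static_cast<float>(Bb[col * ldb + t]); // B
            acc += a * bv;
          }
          Cb[col * ldc + row] = static_cast<DT>(
              alpha * acc + beta * static_cast<float>(Cb[col * ldc + row]));
        }
      }
    }
  }

For the input-gradient GEMM that follows, dQKV is laid out as [num_tokens, qProjsize * num_heads, 3] with the token index fastest, i.e. a column-major n_ x k_ matrix with leading dimension n_ = num_tokens; recovering dX of shape [m->qSize, num_tokens] therefore needs op(B) = B^T, which is consistent with the switch to CUBLAS_OP_T and ldb = n_ below. In terms of the sketch above, that call is the single-batch case:

  reference_gemm_strided_batched(/*transA=*/false, /*transB=*/true,
                                 m_, n_, k_, alpha, A, /*lda=*/m_, 0,
                                 B, /*ldb=*/n_, 0, beta, C, /*ldc=*/m_, 0, 1);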
(m->devQKVProjArray); // matrix C: gradients w.r.t. input - // matrix C's layout: [num_tokens, m->qSize] + // matrix C's layout: [m->qSize, num_tokens] DT *C = input_grad_ptr + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; int m_ = m->qSize; int n_ = num_tokens; int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); int lda = m_; - int ldb = k_; + int ldb = n_; int ldc = m_; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, - CUBLAS_OP_N, + CUBLAS_OP_T, m_, n_, k_, From 3955b0bebdfdc636c0947b1373fe66213d61691f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 17:12:47 -0500 Subject: [PATCH 112/198] fix --- src/ops/inc_multihead_self_attention.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b5ed032137..ea60a48e75 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1157,7 +1157,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->keyCache) + i * kt_req_block_size; // matrix C: gradients for query (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + DT *C = static_cast
(m->devQKVProjArray); // after transposition & striding // after transposition & striding int m_ = num_tokens; From c5346381bfb0489379eb6f429d066855adb62c1b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 3 Dec 2023 11:51:06 -0500 Subject: [PATCH 113/198] Update inc_multihead_self_attention.cu --- src/ops/inc_multihead_self_attention.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ea60a48e75..89f0c1f3e7 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -967,7 +967,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->handle.workSpace); // matrix C: gradients for value (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + 2*(m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients + DT *C = static_cast
(m->devQKVProjArray) + 2 * num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients // after transpositions int m_ = num_tokens; // total_tokens int n_ = m->vProjSize; // num_new_tokens @@ -1107,7 +1107,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->query_activation_buffer); // matrix C: gradients for key (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + DT *C = static_cast
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients // after transposition & striding int m_ = num_tokens; int n_ = m->kProjSize; From fd956c95a8d9c719342c3a659ca4b258cc117012 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 4 Dec 2023 01:43:18 -0500 Subject: [PATCH 114/198] Update inc_multihead_self_attention.cu --- src/ops/inc_multihead_self_attention.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 89f0c1f3e7..e273e1bb6c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1160,9 +1160,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *C = static_cast
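The two offset fixes above follow directly from the [num_tokens, qProjsize * num_heads, 3] layout of m->devQKVProjArray: each of the Q, K and V gradient blocks spans num_tokens * qProjSize * num_q_heads elements, so the K block starts one full block in and the V block two. A small sketch of that arithmetic; the helper is hypothetical and assumes qProjSize == kProjSize == vProjSize, which is what the single-size layout implies:

  // Start of the Q (which = 0), K (1) or V (2) gradient block inside
  // devQKVProjArray for the current request's tokens.
  template <typename DT>
  DT *qkv_grad_block(DT *devQKVProjArray, int which, int num_tokens,
                     int qProjSize, int num_q_heads) {
    size_t block = static_cast<size_t>(num_tokens) * qProjSize * num_q_heads;
    return devQKVProjArray + which * block;
  }

Under this convention the value-gradient pointer is qkv_grad_block(devQKVProjArray, 2, ...), i.e. devQKVProjArray + 2 * num_tokens * qProjSize * num_q_heads, and the key-gradient pointer is block 1, matching the corrected offsets in both hunks.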
(m->devQKVProjArray); // after transposition & striding // after transposition & striding - int m_ = num_tokens; + int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; // num_new_tokens + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; @@ -1171,7 +1171,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideB = m->qProjSize; int strideC = num_tokens * m->qProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_T, m_, n_, From 3a34c88f901e5b3271f06682f19d08e1a052baff Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:40:18 -0500 Subject: [PATCH 115/198] use grad to store peft in/output (#1241) * use grad to store peft in/output * format * . --- src/ops/add_bias_residual_layer_norm.cc | 48 +++++++++-------------- src/ops/fused.cc | 42 ++++++++------------ src/ops/inc_multihead_self_attention.cc | 44 +++++++-------------- src/ops/layer_norm.cc | 35 ++++++----------- src/ops/linear.cc | 45 ++++++++-------------- src/ops/lora_linear.cc | 27 +++++-------- src/ops/residual_layer_norm.cc | 51 +++++++++---------------- src/ops/residual_rms_norm.cc | 43 ++++++++------------- src/ops/rms_norm.cc | 35 ++++++----------- src/ops/sigmoid_silu_multi.cc | 14 +++---- src/ops/softmax.cc | 27 +++++-------- src/parallel_ops/allreduce.cc | 24 ++++-------- src/runtime/inference_manager.cc | 6 +++ 13 files changed, 157 insertions(+), 284 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 82c71f517f..1f03d566ac 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -910,50 +910,36 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, + parallel_is, TaskArgument(NULL, 0), argmap, + Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // attn bias grad - 
launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index bbd99c5986..b7dbcaccb1 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,45 +487,33 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); - IndexLauncher launcher(FUSEDOP_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[i]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, batch_inputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; for (int i = 0; i < numWeights; i++) { assert(weights[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(weights[i]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[i]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numWeights; for (int i = 0; i < numOutputs; i++) { assert(outputs[i]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[i]->region)); + RegionRequirement(batch_outputs[i]->part, 0 /*projection id*/, + WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -561,11 +549,11 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_inputs[i]->region)); + batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; @@ -582,11 +570,11 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, for (int 
i = 0; i < numOutputs; i++) { assert(outputs[i]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, + RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_outputs[i]->region)); + batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index b66d524303..66197b174e 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -891,42 +891,26 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( size_t machine_view_hash = view->hash(); int idx = 0; IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, + parallel_is, TaskArgument(nullptr, 0), argmap, + Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement(RegionRequirement( + weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, + weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement(RegionRequirement( + weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, + weights[1]->region, ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index ba2d43022f..915bd0d1a7 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -661,36 +661,25 @@ Legion::FutureMap size_t machine_view_hash = view->hash(); /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(2, FID_DATA); if (elementwise_affine) { // regions[2](I): gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index fa74e22fc6..13f2ae0a7a 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -688,41 +688,26 @@ FutureMap Linear::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? 
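From this commit onward every peft_bwd launcher binds the PEFT gradient buffers through part_grad / region_grad instead of the forward part / region pair, so backward tasks operate on a dedicated gradient region and leave the forward activations untouched. A minimal sketch of the matching task-side mapping, assuming FlexFlow's usual accessor helpers and the region ordering used in Linear::peft_bwd above (input gradient in regions[0], output gradient in regions[1]); illustrative only, not taken from the patch:

  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
  GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW(
      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
  // Both accessors now resolve to the region_grad instances wired by the
  // launcher, not to the forward `region` that holds activations.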
MAP_TO_ZC_MEMORY : 0)); + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement( + weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, + weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); if (use_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[1]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 05edeab833..050349ccb7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -577,26 +577,17 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, MachineView const *view = mv ? mv : &output_tensor->machine_view; set_argumentmap_for_inference(ff, argmap, output_tensor); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 4bee47de6c..fe8f0094cb 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -701,53 +701,38 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad 1 - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (use_two_residuals) { // residual grad 2 - launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); } if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 953dd60242..09e6327de7 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -630,42 +630,29 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); // regions[0](I): RMS output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); launcher.add_field(0, FID_DATA); // regions[2](I/O): residual input grad 0 - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[3](I/O): residual input grad 1 - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); // regions[4](I): gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 5a8cfe8eff..b2d3d4521b 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -527,35 +527,24 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[2](I): weight - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index d064bd0a1c..acca39ab33 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -373,25 +373,25 @@ FutureMap machine_view_hash); launcher.add_future(bc); // output grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, - batch_outputs[0]->region)); + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // input 1 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_inputs[0]->region)); + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // input 2 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_inputs[1]->region)); + batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 88ffec3642..d852e09b46 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -399,26 +399,17 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(nullptr, 
0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 7f147dad6f..78ce807aa6 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -355,24 +355,16 @@ FutureMap AllReduce::peft_bwd(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, batch_outputs[0]->parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 81a72a5c12..39d3ecdf81 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -229,6 +229,12 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + + pt->region_grad = runtime->create_logical_region( + ctx, pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part_grad = runtime->get_logical_partition( + ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = From 94230d92c54c574a96a44995bc9b52e64e1a1341 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 5 Dec 2023 23:41:12 -0500 Subject: [PATCH 116/198] format --- src/ops/add_bias_residual_layer_norm.cc | 44 ++++++++++++++++------- src/ops/fused.cc | 45 ++++++++++++++--------- src/ops/inc_multihead_self_attention.cc | 42 +++++++++++++++------- src/ops/inc_multihead_self_attention.cu | 19 ++++++---- src/ops/layer_norm.cc | 33 +++++++++++------ src/ops/linear.cc | 43 
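The inference_manager.cc hunk above gives every PEFT tensor a gradient twin: region_grad is created from the same index space and field space as the forward region, and part_grad reuses the forward index partition, so gradients are tiled across devices exactly like the activations they correspond to. A stand-alone sketch of that pairing, hypothetical but using only the Legion calls already present in the hunk:

  Legion::LogicalRegion make_grad_twin(Legion::Context ctx,
                                       Legion::Runtime *runtime,
                                       Legion::LogicalRegion base_region,
                                       Legion::LogicalPartition base_part,
                                       Legion::LogicalPartition &part_grad) {
    Legion::LogicalRegion region_grad = runtime->create_logical_region(
        ctx, base_region.get_index_space(), base_region.get_field_space());
    part_grad = runtime->get_logical_partition(
        ctx, region_grad, base_part.get_index_partition());
    return region_grad;
  }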
+++++++++++++++------- src/ops/lora_linear.cc | 25 +++++++++---- src/ops/residual_layer_norm.cc | 47 +++++++++++++++++-------- src/ops/residual_rms_norm.cc | 40 ++++++++++++++------- src/ops/rms_norm.cc | 33 +++++++++++------ src/ops/sigmoid_silu_multi.cc | 33 +++++++++-------- src/ops/softmax.cc | 25 +++++++++---- src/parallel_ops/allreduce.cc | 22 ++++++++---- src/runtime/inference_manager.cc | 7 ++-- 14 files changed, 314 insertions(+), 144 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 1f03d566ac..be7b357f23 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -910,36 +910,54 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, TaskArgument(NULL, 0), argmap, - Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // attn bias grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index b7dbcaccb1..ea1c970cc5 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,33 +487,45 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); - IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher 
launcher(FUSEDOP_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement( - RegionRequirement(batch_inputs[i]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, batch_inputs[i]->region)); + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; for (int i = 0; i < numWeights; i++) { assert(weights[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement( - RegionRequirement(weights[i]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[i]->region)); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numWeights; for (int i = 0; i < numOutputs; i++) { assert(outputs[i]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, 0 /*projection id*/, - WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region)); + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -549,11 +561,12 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[i]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 66197b174e..ca6eb7c095 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -891,26 +891,44 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( size_t machine_view_hash = view->hash(); int idx = 0; IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, - parallel_is, TaskArgument(nullptr, 0), argmap, - Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region, ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); if (qkv_bias || final_bias) { - launcher.add_region_requirement(RegionRequirement( - weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[1]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cd7cecaf91..baa24b7c00 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -967,11 +967,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->handle.workSpace); // matrix C: gradients for value (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast<DT *>
(m->devQKVProjArray) + 2 * num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients // after transpositions - int m_ = num_tokens; // total_tokens + int m_ = num_tokens; // total_tokens int n_ = m->vProjSize; // num_new_tokens - int k_ = num_tokens; // num_new_tokens + int k_ = num_tokens; // num_new_tokens // before transpositions int lda = num_tokens; // num_new_tokens int ldb = m->vProjSize * m->num_q_heads; @@ -1027,7 +1030,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideA = m->vProjSize; int strideB = m->vProjSize; int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens - + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -1107,7 +1110,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->query_activation_buffer); // matrix C: gradients for key (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast<DT *>
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients // after transposition & striding int m_ = num_tokens; int n_ = m->kProjSize; @@ -1162,7 +1169,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 915bd0d1a7..d4b5d6a543 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -661,25 +661,38 @@ Legion::FutureMap size_t machine_view_hash = view->hash(); /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(2, FID_DATA); if (elementwise_affine) { // regions[2](I): gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 13f2ae0a7a..e71be3bbf4 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -688,26 +688,43 @@ FutureMap Linear::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, 
EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); if (use_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[1]->region)); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 050349ccb7..9ed411397d 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -577,17 +577,28 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, MachineView const *view = mv ? mv : &output_tensor->machine_view; set_argumentmap_for_inference(ff, argmap, output_tensor); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index fe8f0094cb..c142e47e62 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -701,38 +701,57 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad 1 launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (use_two_residuals) { // residual grad 2 launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); } if (elementwise_affine) { // gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 09e6327de7..28dd7e2745 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -630,29 +630,45 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); // regions[0](I): RMS output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); launcher.add_field(0, FID_DATA); // regions[2](I/O): residual input grad 0 launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[3](I/O): residual input grad 1 launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); // regions[4](I): gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index b2d3d4521b..a1749d66af 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -527,24 +527,37 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[2](I): weight - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index acca39ab33..c01f47aa21 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -373,25 +373,28 @@ FutureMap machine_view_hash); launcher.add_future(bc); // output grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // input 1 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // input 2 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index d852e09b46..23f2eb9edf 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -399,17 +399,28 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false 
/*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 78ce807aa6..4478a2aedc 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -355,16 +355,26 @@ FutureMap AllReduce::peft_bwd(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, batch_outputs[0]->parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 39d3ecdf81..4f7d0c9632 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -230,9 +230,10 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); - pt->region_grad = runtime->create_logical_region( - ctx, pt_base->region.get_index_space(), - pt_base->region.get_field_space()); + pt->region_grad = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); pt->part_grad = runtime->get_logical_partition( ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; From b985cc9ecf8c91ef09f5f2fe27da6274c7866af7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 6 Dec 2023 00:30:07 -0500 Subject: [PATCH 117/198] enable peft request --- inference/incr_decoding/incr_decoding.cc | 14 +++++++------- src/ops/inc_multihead_self_attention.cu | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc 
b/inference/incr_decoding/incr_decoding.cc index f1a51aa670..dcd1b5a5ab 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -292,13 +292,13 @@ void FlexFlow::top_level_task(Task const *task, requests.push_back(inference_req); total_num_requests++; // Add fine-tuning request - // Request fine_tuning_req; - // fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - // fine_tuning_req.max_sequence_length = 128; - // fine_tuning_req.peft_model_id = peft_model_id; - // fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - // requests.push_back(fine_tuning_req); - // total_num_requests++; + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); + total_num_requests++; } GenerationResult result = model.generate(requests); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index baa24b7c00..dec116addd 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1961,4 +1961,24 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( GenericTensorAccessorR const weight, DataType data_type, cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); }; // namespace FlexFlow From b9c392631b596db788ead74fe76d08d80a487b7c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 6 Dec 2023 09:31:37 -0500 Subject: [PATCH 118/198] several hacks for performance measurement; some of the changes should be reverted --- inference/incr_decoding/incr_decoding.cc | 32 ++++++++++++++++-------- src/ops/argmax.cc | 5 ++++ src/runtime/request_manager.cc | 10 ++++++-- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index dcd1b5a5ab..94ccb1cabf 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -138,9 +138,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 8; - int max_tokens_per_batch = 128; - int max_sequence_length = 256; + int max_requests_per_batch = 2; + int max_tokens_per_batch = 300; + int max_sequence_length = 300; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -272,6 +272,7 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { +#ifdef DEADCODE using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -291,15 +292,26 @@ void FlexFlow::top_level_task(Task const *task, inference_req.peft_model_id = peft_model_id; requests.push_back(inference_req); total_num_requests++; - // Add fine-tuning request 
- Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + } +#endif + std::vector requests; + for (int i = 0; i < (max_requests_per_batch - 1) * 4; i++) { + Request inference_req; + inference_req.prompt = "b"; + inference_req.max_sequence_length = 40; + requests.push_back(inference_req); total_num_requests++; } + // Add a fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 256; + fine_tuning_req.max_training_steps = 256; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair("b", "")); + requests.push_back(fine_tuning_req); + total_num_requests++; + GenerationResult result = model.generate(requests); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index cabb8b204f..dd0e2bb822 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,6 +392,11 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + // Note that we free activation allocator here since argmax is the + // last operator in forward + if (m->handle.peft_activation_allocator != nullptr) { + m->handle.peft_activation_allocator->free_all(); + } InferenceResult ir; if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index cbb21e03e0..1d4a9ee47c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -246,13 +246,17 @@ RequestManager::RequestGuid request.peft_model_id = request_.peft_model_id; request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = 1; // TODO: let user set this + request.max_training_steps = request_.max_training_steps; for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } + // FIXME: this is a hack, must undo + while (input_tokens.size() < 256) { + input_tokens.push_back(293); + } std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > @@ -355,6 +359,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { + log_req_mgr.print("[Old BC] Num tokens: %d", old_bc.num_tokens); const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -539,7 +544,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + // FIXME: we reserve one slot for PEFT req now + for (int i = 0; i < BatchConfig::max_requests_per_batch() - 1; i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { From 
4d5c3e0797b4755cb8a572f2cc5985ffa33a6c57 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 16 Dec 2023 10:37:27 -0500 Subject: [PATCH 119/198] Update sigmoid_silu_multi.cu --- src/ops/sigmoid_silu_multi.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index ec88042a1d..e3b6f7a69a 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -233,7 +233,7 @@ void SigmoidSiluMulti::backward_kernel_wrapper( input1.get_float_ptr(), input2.get_float_ptr(), input1_grad.get_float_ptr(), - input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), m->reset_input_grads[0], m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { From 7bf863a15fc583c66f328dbe5f520b611860c212 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Dec 2023 17:48:33 -0500 Subject: [PATCH 120/198] RoPE backward --- src/ops/inc_multihead_self_attention.cu | 62 ++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index dec116addd..452a8c09f6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -492,6 +492,46 @@ __global__ void } } +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, @@ -1166,7 +1206,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] DT *C = static_cast
(m->devQKVProjArray); // after transposition & striding - // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; int k_ = num_tokens; @@ -1201,7 +1240,26 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Step 7: compute gradients w.r.t. input + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>(A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + } + } + // Step 8: compute gradients w.r.t. input { float alpha = 1.0f, beta = 0.0f; if (!m->reset_input_grads[0]) { From 960654ed783fef09243eae666153947eaa1be404 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:40:26 -0500 Subject: [PATCH 121/198] PEFT bug fixes and alignment (#1269) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * linting --- .../ops/add_bias_residual_layer_norm.h | 2 - .../flexflow/ops/kernels/softmax_kernels.h | 3 +- include/flexflow/ops/residual_layer_norm.h | 1 + inference/incr_decoding/incr_decoding.cc | 44 +- inference/models/opt.cc | 10 +- src/ops/add_bias_residual_layer_norm.cc | 27 +- src/ops/add_bias_residual_layer_norm.cu | 72 +- src/ops/fused.cc | 15 + src/ops/fused.cu | 78 +- src/ops/inc_multihead_self_attention.cc | 4 +- src/ops/kernels/softmax.cu | 13 +- src/ops/layer_norm.cu | 18 - src/ops/linear.cc | 2 +- src/ops/lora_linear.cc | 4 +- src/ops/residual_layer_norm.cc | 57 +- src/ops/residual_layer_norm.cu | 76 +- src/ops/residual_rms_norm.cc | 7 +- src/ops/softmax.cc | 18 +- src/runtime/request_manager.cc | 10 +- tests/peft/alignment_tests.ipynb | 1427 +++++++++++++++++ tests/peft/qk_prods_alignment.ipynb | 24 + 21 files changed, 1681 insertions(+), 231 deletions(-) create mode 100644 tests/peft/alignment_tests.ipynb create mode 100644 tests/peft/qk_prods_alignment.ipynb diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 5c4a49f998..38bb825a4d 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -124,7 +124,6 @@ class AddBiasResidualLayerNorm : public Op { T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, ffStream_t stream); static void @@ -132,7 +131,6 @@ class AddBiasResidualLayerNorm : public Op { GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma); public: diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index db5e9799e9..b3dfe4f430 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -39,7 +39,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 35ddb171d4..d924132452 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -28,6 +28,7 @@ class ResidualLayerNorm : public Op { float _eps, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, 
std::vector const &, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 94ccb1cabf..009cd1af45 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -138,9 +138,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 2; - int max_tokens_per_batch = 300; - int max_sequence_length = 300; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -272,7 +272,6 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { -#ifdef DEADCODE using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -286,32 +285,21 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); // Add inference request - Request inference_req; - inference_req.prompt = text; - inference_req.max_sequence_length = 128; - inference_req.peft_model_id = peft_model_id; - requests.push_back(inference_req); + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); total_num_requests++; } -#endif - std::vector requests; - for (int i = 0; i < (max_requests_per_batch - 1) * 4; i++) { - Request inference_req; - inference_req.prompt = "b"; - inference_req.max_sequence_length = 40; - requests.push_back(inference_req); - total_num_requests++; - } - // Add a fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 256; - fine_tuning_req.max_training_steps = 256; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair("b", "")); - requests.push_back(fine_tuning_req); - total_num_requests++; - GenerationResult result = model.generate(requests); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9069aef9e1..e0e940b186 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -193,7 +193,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, - AC_MODE_NONE, + AC_MODE_RELU, true, DT_NONE, nullptr, @@ -202,8 +202,7 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - Tensor activation = ff.relu(fc1, false); - fc2 = ff.dense(activation, + fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, true, @@ -216,7 +215,7 @@ void OPT::create_opt_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( - activation, + fc1, fc2, OP_LORA_MLP_SECOND, std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); @@ -255,7 +254,8 @@ void 
OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } //------------------- compile the model -------------------------------- diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index be7b357f23..88a34b7eb5 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -931,7 +931,7 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -939,25 +939,17 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); - // attn bias grad - launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region_grad)); - launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, + launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region)); + weights[1]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -1001,14 +993,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); - GenericTensorAccessorW attn_bias_grad = - helperGetGenericTensorAccessorRW(m->weight_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR gamma; if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); @@ -1020,13 +1004,12 @@ void AddBiasResidualLayerNorm::peft_bwd_task( runtime); } AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( - m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); + m, output_grad, input_grad, residual_grad, gamma); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector weights_accessors; - weights_accessors.push_back(attn_bias_grad); if (m->elementwise_affine) { weights_accessors.push_back(gamma); } diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 097ace3676..ab017ed46c 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -101,9 +101,9 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) ? 
shared[lid] - : 0; + : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -536,8 +536,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -549,9 +550,7 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *X_i = X + i1 * N; T const *dY_i = dY + i1 * N; T *dX_i = dX + i1 * N; - T *dX_residual1_i = dX_residual1 + i1 * N; - T *dX_residual2_i = - (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + T *dX_residual_i = dX_residual + i1 * N; // vectorized reads don't improve perf, so use regular unrolling for (; l + unroll - 1 < N; l += blockDim.x * unroll) { @@ -592,10 +591,15 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; - if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; } } } @@ -607,13 +611,24 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); } /*static*/ @@ -661,7 +676,8 @@ void AddBiasResidualLayerNorm::backward_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { @@ -764,29 +780,11 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); @@ -799,7 +797,8 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); } @@ -809,7 +808,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma) { cudaStream_t 
stream; checkCUDA(get_legion_stream(&stream)); @@ -825,7 +823,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_float_ptr(), input_grad.get_float_ptr(), residual_grad.get_float_ptr(), - attn_bias_grad.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, stream); } else if (m->output_type[0] == DT_HALF) { @@ -833,7 +830,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_half_ptr(), input_grad.get_half_ptr(), residual_grad.get_half_ptr(), - attn_bias_grad.get_half_ptr(), m->elementwise_affine ? gamma.get_half_ptr() : nullptr, stream); } else { diff --git a/src/ops/fused.cc b/src/ops/fused.cc index ea1c970cc5..8afd61aece 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -528,6 +528,21 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { + printf("operator %i is last SOFTMAX! adding output %i\n", + numOperators - 1, + numOutputs - 1); + assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs - 1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9954a8b43a..f6bed71f6a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -95,8 +95,11 @@ __host__ void assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); // Domain input_domain[MAX_NUM_INPUTS]; // Domain weight_domain[MAX_NUM_WEIGHTS]; // Domain output_domain[MAX_NUM_OUTPUTS]; @@ -141,6 +144,7 @@ __host__ void ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -625,9 +629,22 @@ __host__ void assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + m, + bc, + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); break; } case OP_ALLREDUCE: { @@ -1008,7 +1025,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[0], + my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], my_weight_accessor[0]); @@ -1078,27 +1095,20 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 2); // weight + bias } } - GenericTensorAccessorR residual2; 
+ GenericTensorAccessorW residual2; if (m->use_two_residuals) { residual2 = my_input_grad_accessor[2]; } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } } - // TODO: implment me - assert(false); - // ResidualLayerNorm::inference_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // residual2, - // my_output_accessor[0], - // my_output_accessor[1], - // gamma, - // beta); + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { @@ -1115,31 +1125,17 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias } } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_grad_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - // TODO: implement me - assert(false); - // AddBiasResidualLayerNorm::inference_kernel_wrapper( - // m, - // attn_bias_dim, - // residual_volume, - // my_input_accessor[0], - // my_output_accessor[0], - // my_output_accessor[1], - // my_input_accessor[1], - // my_weight_accessor[0], - // gamma, - // beta); + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); break; } case OP_SIGMOID_SILU_MULTI: { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ca6eb7c095..5d52034575 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -902,7 +902,7 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); @@ -964,7 +964,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; if (*m->qkv_bias || *m->final_bias) { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 0fc827319d..271a291b09 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -121,7 +121,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -138,6 +139,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, bc, @@ -145,6 +151,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 6e12c53230..1d4e94d7d5 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -664,24 +664,6 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e71be3bbf4..15789ae2e9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -700,7 +700,7 @@ FutureMap Linear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 9ed411397d..e39b444af4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -589,14 +589,14 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index c142e47e62..8563c299ab 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -117,7 +117,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,20 +132,20 @@ void FFModel::residual_layer_norm(const Tensor input, ? cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, - OP_RESIDUAL_LAYERNORM, - data_type, - name, - 2 + use_two_residuals /*inputs*/, - num_weights, - 2 /*outputs*/, - casted_input, - casted_residual1, - casted_residual2); + Layer *ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -326,6 +325,18 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -439,11 +450,11 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } // added: input + residual(s) @@ -723,7 +734,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -731,7 +742,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -740,7 +751,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -768,9 +779,7 @@ void ResidualLayerNorm::peft_bwd_task( } assert(task->regions.size() == regions.size()); ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + - (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); int region_idx = 0, task_region_idx = 0; @@ -807,8 +816,7 @@ void ResidualLayerNorm::peft_bwd_task( } GenericTensorAccessorR gamma; if (m->elementwise_affine) { - assert(m->use_bias == (regions.size() == 6)); - gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], regions[region_idx++], task->regions[task_region_idx++], FID_DATA, @@ -942,12 +950,11 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); if (bc->num_tokens == 0) { return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == 4 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 4bfac1887f..1f87949234 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -239,20 +239,17 @@ void ResidualLayerNorm::inference_kernel_wrapper( } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int in_dim = - added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( @@ -261,14 +258,14 @@ void ResidualLayerNorm::inference_kernel_wrapper( if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + tokens_previous_requests * in_dim, + added_output.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + tokens_previous_requests * in_dim, + added_output.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); @@ -481,6 +478,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -535,10 +535,22 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } } } } @@ -552,11 +564,25 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); } /*static*/ @@ -604,6 +630,9 @@ void backward_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); if (gamma_grad_ptr != NULL 
|| beta_grad_ptr != NULL) { @@ -710,28 +739,12 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( output_grad_ptr, static_cast(m->input_activation), @@ -741,6 +754,9 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28dd7e2745..c2fbe11544 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -90,9 +90,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,7 +100,7 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); @@ -710,6 +710,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 23f2eb9edf..1d062b552b 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -355,6 +355,14 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + // we add the region below in order to copy the output to the grad tensor + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -363,8 +371,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -376,7 +384,9 @@ void Softmax::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = 
helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - inference_kernel_wrapper(m, bc, input, output); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + inference_kernel_wrapper(m, bc, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -411,7 +421,7 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1d4a9ee47c..cbb21e03e0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -246,17 +246,13 @@ RequestManager::RequestGuid request.peft_model_id = request_.peft_model_id; request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = request_.max_training_steps; + request.max_training_steps = 1; // TODO: let user set this for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } - // FIXME: this is a hack, must undo - while (input_tokens.size() < 256) { - input_tokens.push_back(293); - } std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > @@ -359,7 +355,6 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - log_req_mgr.print("[Old BC] Num tokens: %d", old_bc.num_tokens); const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -544,8 +539,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - // FIXME: we reserve one slot for PEFT req now - for (int i = 0; i < BatchConfig::max_requests_per_batch() - 1; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb new file mode 100644 index 0000000000..e2a8978ea3 --- /dev/null +++ b/tests/peft/alignment_tests.ipynb @@ -0,0 +1,1427 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "hf_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors\"\n", + "ff_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors\"\n", + "def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " 
hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath))\n", + " assert(os.path.exists(ff_tensor1_filepath))\n", + " assert(os.path.exists(ff_tensor2_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor1 = ff_tensor1[:len_hf_tensor]\n", + " ff_tensor2 = ff_tensor2[:len_hf_tensor]\n", + " ff_tensor = ff_tensor1 - ff_tensor2\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_hf_tensors(tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " if not (np.allclose(hf_tensor1.detach().cpu().numpy(), 
hf_tensor2.detach().cpu().numpy())):\n", + " print(f\"mismatch between {tensor1_fp} and {tensor2_fp}\")\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "\n", + "def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor_sum = torch.load(tensor_sum_fp)\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:\n", + " assert(len(hf_tensor_sum) == 1)\n", + " hf_tensor_sum = hf_tensor_sum[0]\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape)\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)\n", + " sum_check_tensor = hf_tensor1 + hf_tensor2\n", + " if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())):\n", + " print(f\"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}\")\n", + " print(tensor_sum_fp)\n", + " print(sum_check_tensor)\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "def check_hf_zero_tensor(hf_tensor_fp):\n", + " assert(os.path.exists(hf_tensor_fp))\n", + " hf_tensor1 = torch.load(hf_tensor_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)\n", + "def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=\"\"):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + "\n", + " print(f\"{txt} - HF tensor:\")\n", + " print(hf_tensor)\n", + " print(f\"{txt} - FF tensor: \")\n", + " print(ff_tensor)\n", + "def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = 
np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + "\n", + " if (ff_tensor1.shape != ff_tensor2.shape):\n", + " print(ff_tensor1.shape, ff_tensor2.shape)\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + "\n", + " if max_len > -1:\n", + " ff_tensor1 = ff_tensor1[:max_len]\n", + " ff_tensor2 = ff_tensor2[:max_len]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])\n", + " ff_tensor1 = ff_tensor1[:minlen]\n", + " ff_tensor2 = ff_tensor2[:minlen]\n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')\n", + " \n", + " ff_sum = ff_tensor1 + ff_tensor2\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(f\"Sum Tensor: {ff_tensor_sum}\\nActual sum:{ff_sum}\")\n", + " print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_RMSNorm_shard-id_0_output_0\"\n", + " if layer_num > 0:\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " hf_attn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out)\n", + " hf_ffn_norm_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_down_proj_out = 
f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_weight_base_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_weight_base_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = 
f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_weight_base_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + "hf_BWD_norm_out = f\"{hf_weight_base_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_weight_base_path}/base_model.model.model.norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " 
self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... 
True True True]\n", + "[2394]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "4.383680555555555% mismatch in QK prods softmax out grad\n", + "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", + "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", + "4.817708333333334% mismatch in attention input grads\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.go_0\"\n", + " hf_BWD_w2_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.go_0\"\n", + " 
hf_BWD_ffn_norm_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_weight_base_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.o_proj.weight\"\n", + " # hf_BWD_attn_vproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " 
ff_BWD_w1_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_0\"\n", + " ff_BWD_attn_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " # ff_BWD_attn_v_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_weight_0\"\n", + " # ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + " # xxx = torch.load(hf_BWD_attn_out_out)\n", + " # xxx.detach().cpu().numpy().tofile(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " # print(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if layer_num == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " 
check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " #compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768) # should fail\n", + "\n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " #print(torch.load(hf_w2_weight).shape)\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + "\n", + " compare_tensors(hf_FWD_w1_out, ff_FWD_w1_out)\n", + " compare_tensors(hf_FWD_w3_out, ff_FWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2)\n", + " # compare_tensors(hf_BWD_attn_out_out, ff_BWD_ffn_norm_in2)\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + "\n", + " # compare attn weight tensors\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " 
assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.qk_prods_softmax\"\n", + " ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " assert(np.allclose(ff_attn_heads_grads, hf_attn_heads_grads, atol=1e-2))\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " assert(np.allclose(hf_vproj_grads, ff_vproj_grads, atol=1e-2))\n", + "\n", + " \n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " # 
assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " assert(pct_mismatch <= 0.05)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.query_activation\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " \n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", 
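+ "    # the .cuda() calls below move the rotary cos/sin caches to the GPU, matching the device of the HF gradient tensors loaded above\n",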
+ " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = 
torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) ##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " # print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " # print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " # print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.05)\n", + " \n", + "\n", + " assert False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([12, 24, 64])\n", + "tensor([[-1.5730e-02, -4.1161e-02, 3.0593e-02, ..., 3.8630e-01,\n", + " 3.2884e-01, 3.6067e-01],\n", + " [-2.8613e+01, -5.5872e+00, 2.9385e+01, ..., 3.8782e+01,\n", + " 9.6901e+01, 9.8470e+01],\n", + " [ 3.3027e+00, 1.8276e-01, -1.8497e+00, ..., -4.4052e+01,\n", + " -2.0010e+01, -2.9788e+01],\n", + " ...,\n", + " [-7.6471e-02, -1.8892e-01, 3.6430e-01, ..., -2.7493e-01,\n", + " 5.7017e-01, -1.5986e-01],\n", + " [ 2.5780e+00, -1.8153e+00, 2.5088e+00, ..., -1.0776e+01,\n", + " 6.2167e-01, 8.3755e-01],\n", + " [-6.8324e-02, 1.7568e-01, -3.2311e-01, ..., 3.1202e+00,\n", + " -2.6652e-01, -1.1917e+00]])\n", + "(24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 
3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", + " 9.6900581e+01 9.8469597e+01]\n", + " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", + " -2.0009745e+01 -2.9787930e+01]\n", + " ...\n", + " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", + " 5.7017130e-01 -1.5985624e-01]\n", + " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", + " 6.2166649e-01 8.3755457e-01]\n", + " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", + " -2.6652411e-01 -1.1917179e+00]]\n" + ] + } + ], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + 
"hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + "print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Manual matmul checks\n", + "# ff_w2_grad_out_tensor = np.loadtxt(ff_BWD_w2_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w2_weight_tensor = np.loadtxt(ff_w2_weight, delimiter=',').reshape((3072,768), order='F')\n", + "# ff_w2_gradin_tensor = np.matmul(ff_w2_weight_tensor, ff_w2_grad_out_tensor).reshape((3072,128), order='F')\n", + "\n", + "# ff_lora_gradout_tensor = np.loadtxt(ff_BWD_lora_B_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_lora_A_weight_tensor = np.loadtxt(ff_lora_A_weight, delimiter=',').reshape((3072,16), order='F')\n", + "# ff_lora_B_weight_tensor = np.loadtxt(ff_lora_B_weight, delimiter=',').reshape((16,768), order='F')\n", + "# ff_lora_int_grad_tensor = np.matmul(ff_lora_B_weight_tensor, ff_lora_gradout_tensor)\n", + "# ff_lora_gradint_tensor = np.matmul(ff_lora_A_weight_tensor, ff_lora_int_grad_tensor)\n", + "\n", + "# # ff_w2_gradin_tensor = ff_w2_gradin_tensor + ff_lora_gradint_tensor\n", + "# #print(ff_w2_gradin_tensor[:,:24])\n", + "# print(\"calculated LORA grad in\")\n", + "# print(ff_lora_gradint_tensor[:,:24])\n", + "# # ff_BWD_w2_in_pre_tensor = np.loadtxt(ff_BWD_w2_in_pre, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_BWD_lora_A_in_tensor = np.loadtxt(ff_BWD_lora_A_in, delimiter=',').reshape((3072,128), order='F')\n", + "# print(\"FlexFlow LORA grad in\")\n", + "# print(ff_BWD_lora_A_in_tensor[:,:24])\n", + "# # print(ff_BWD_w2_in_pre_tensor[:,:24])\n", + "# print(\"HF lora grad in\")\n", + "# print(torch.load(hf_BWD_loraA_in).squeeze().T.detach().cpu().numpy())\n", + "# compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + "# simulate act_fn_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')\n", + "# w3_fwd_out_tensor = np.loadtxt(ff_FWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# #print(ssm_out_grad_tensor.shape, w3_fwd_out_tensor.shape)\n", + "# act_fn_out_check = np.multiply(ssm_out_grad_tensor, w3_fwd_out_tensor)\n", + "# print(\"simulated act fn out - simulated\")\n", + "# print(act_fn_out_check[:,:24])\n", + "# print(\"simulated act fn out - HF\")\n", + "# print(torch.load(hf_BWD_act_fn_out).detach().cpu().numpy().squeeze().T)\n", + "\n", + "# Simulated w3_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')[:,:24]\n", + "# act_fnc_out_tensor = np.loadtxt(ff_FWD_act_fnc_out, delimiter=',').reshape((3072,24), order='F')\n", + "# w3_out_gard_check = np.multiply(ssm_out_grad_tensor, act_fnc_out_tensor)\n", + "# print(\"simulated w3 out - FF\")\n", + "# print(w3_out_gard_check)\n", + "# ff_BWD_w3_out_tensor = np.loadtxt(ff_BWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# hf_BWD_w3_out_tensor = torch.load(hf_BWD_w3_out).detach().cpu().numpy().squeeze().T\n", + "# print(\"w3 out, FF\")\n", + "# print(ff_BWD_w3_out_tensor[:,:24])\n", + "# print(\"w3 out, HF\")\n", + "# print(hf_BWD_w3_out_tensor)\n", + "\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# assert False\n", + "# print()\n", + "# print()\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# print_tensors(hf_BWD_w3_in, ff_BWD_w3_in, \"w3 in\")\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out, \"w1 out\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in, \"w1 in\")\n", + "# print_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out, \"ffn norm out\")\n", + "# print_tensors(hf_BWD_ffn_norm_in, 
ff_BWD_ffn_norm_in2, \"ffn norm in\")\n", + "# print()\n", + "# ff_w1_out_tensor = np.loadtxt(ff_BWD_w1_out, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_w1_in_tensor = np.loadtxt(ff_BWD_w1_in, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_in_pre_tensor = np.loadtxt(ff_BWD_w1_in_pre, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_only_in_tensor = ff_w1_in_tensor - ff_w1_in_pre_tensor\n", + "# ff_w1_weight_tensor = np.loadtxt(ff_w1_weight, delimiter=',').reshape((768,3072), order='F')\n", + "# ff_w1_in_check_tensor = np.matmul(ff_w1_weight_tensor, ff_w1_out_tensor)\n", + "# print(\"W1 in (simulated):\")\n", + "# print(ff_w1_in_check_tensor[:,:24])\n", + "# print(\"W1 in (FF):\")\n", + "# print(ff_w1_only_in_tensor[:,:24])\n", + "# print(\"W1 in (HF):\")\n", + "# print(torch.load(hf_BWD_w1_in).squeeze().T.detach().cpu().numpy())\n", + "\n", + "# compare_tensors_difference(hf_BWD_w2_in, ff_BWD_w2_in, ff_BWD_lora_A_in)\n", + "# compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + "#compare_hf_tensors(hf_BWD_ffn_norm_in, hf_BWD_attn_out_out)\n", + "# print(\"\\nw1 out:\")\n", + "\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + "# print(\"\\nW1 in\\n\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# compare_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# print(\"\\nffn_norm\")\n", + "# compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight 
= f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/qk_prods_alignment.ipynb b/tests/peft/qk_prods_alignment.ipynb new file mode 100644 index 0000000000..c2a3644b3d --- /dev/null +++ b/tests/peft/qk_prods_alignment.ipynb @@ -0,0 +1,24 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 20289009b26042bcd9527fc8b696e22c2e28ef75 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:45:17 -0500 Subject: [PATCH 122/198] Fuse bias + relu in OPT (#1271) * fuse bias and relu in opt * fix --- include/flexflow/model.h | 3 ++- python/flexflow/serve/models/opt.py | 5 ++--- src/ops/kernels/linear_kernels.cu | 22 ++++++++++++++++++++++ src/runtime/model.cc | 27 ++++++++++++++++++++++++--- 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 7232cb3f0b..851fac94d2 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1114,7 +1114,7 @@ class FFModel { std::unordered_map>> get_bwd_edge_map() const; - // Internal funcitons + // Internal functions Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); @@ -1122,6 +1122,7 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, std::vector const &inputs); diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 8250c63a9a..4b0b613cca 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -216,13 +216,12 @@ def build_model(self, max_tokens_per_batch): fc1 = ffmodel.dense( ff_norm, self.opt_config.ffn_dim, - ActiMode.AC_MODE_NONE, + ActiMode.AC_MODE_RELU, True, name=f"layers_{i}_fc1", ) - activation = ffmodel.relu(fc1, False) fc2 = ffmodel.dense( - activation, + fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 
8cf5db3f11..51b5e1f6f5 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -306,6 +306,18 @@ Parameter* Linear::get_parameter(int index) */ namespace Internal { +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -398,6 +410,16 @@ void forward_kernel(LinearMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast<DT const *>
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2ee4d4bc08..2048a2c6a2 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3249,6 +3249,27 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (l->op_type != OP_LINEAR) { + return false; + } + // standard opt relu + if (layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + long long value; + l->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (layer_idx >= 1 && layers[layer_idx - 1]->op_type == OP_LINEAR && + activation == AC_MODE_RELU) { + return true; + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3293,9 +3314,9 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_RELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_GELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || From 3bbde567361eb077e2178a38fa756eb199f9a8e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jan 2024 15:49:06 +0000 Subject: [PATCH 123/198] fix --- src/runtime/model.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2048a2c6a2..812a432ef1 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3251,21 +3251,21 @@ Op *FFModel::create_operator_from_layer( bool FFModel::is_mlp_block(int layer_idx) const { auto const &l = layers[layer_idx]; - if (l->op_type != OP_LINEAR) { - return false; - } // standard opt relu - if (layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_RELU && + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && layers[layer_idx - 2]->op_type == OP_LINEAR) { return true; } // mlp layer with relu embedded in first dense layer - long long value; - l->get_int_property("activation", value); - ActiMode activation = (ActiMode)value; - if (layer_idx >= 1 && layers[layer_idx - 1]->op_type == OP_LINEAR && - activation == AC_MODE_RELU) { - return true; + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } } return false; } From 2ebd7f4d40661303f7097334618d52297e479f90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 16 Jan 2024 21:44:24 -0500 Subject: [PATCH 124/198] fix --- include/flexflow/ops/kernels/linear_kernels.h | 9 ++ src/ops/kernels/linear_kernels.cu | 107 ++++++++++++++++++ src/ops/linear.cc | 29 ++--- 3 files changed, 127 insertions(+), 18 deletions(-) diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index ff33755780..bcce9a947a 100644 --- 
a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -50,6 +50,15 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *filter_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); void peft_bwd_kernel_wrapper(LinearMeta const *m, void *input_grad_ptr, void *output_grad_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 51b5e1f6f5..5306be9bdf 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -170,6 +170,113 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = allocator->allocate_instance_untyped( + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim); + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + 
cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] inference time = %.2lfms\n", m->op_name, elapsed); + } +} + void peft_bwd_kernel_wrapper(LinearMeta const *m, void *input_grad_ptr, void *output_grad_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 15789ae2e9..e23a6f48ca 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -652,14 +652,15 @@ void Linear::inference_task(Task const *task, runtime); assert(bias.domain.get_volume() == static_cast(out_dim)); } - forward_kernel_wrapper(m, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + inference_kernel_wrapper(m, + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -719,14 +720,6 @@ FutureMap Linear::peft_bwd(FFModel const &ff, weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); - if (use_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } @@ -741,8 +734,8 @@ void Linear::peft_bwd_task(Task const *task, if (bc->num_active_peft_tokens() == 0) { return; } - assert(regions.size() == (3 + static_cast(m->use_bias))); - assert(task->regions.size() == (3 + static_cast(m->use_bias))); + assert(regions.size() == 3); + assert(task->regions.size() == 3 ); if (m->quantization_type == DT_NONE) { assert(m->input_type[0] == m->weight_type[0]); } From 1b2018b445fe49ea5cbb59fb5dcb30ad814340c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 16 Jan 2024 21:59:11 -0500 Subject: [PATCH 125/198] fix --- src/ops/add_bias_residual_layer_norm.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 88a34b7eb5..a2b426ec0d 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -967,7 +967,7 @@ void AddBiasResidualLayerNorm::peft_bwd_task( assert(task->regions.size() == regions.size()); AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); - assert(regions.size() == 4 + m->elementwise_affine); + assert(regions.size() == 3 + m->elementwise_affine); int region_idx = 0, task_region_idx = 0; @@ -995,7 +995,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( GenericTensorAccessorR gamma; if (m->elementwise_affine) { - assert(m->use_bias == (regions.size() == 6)); gamma = helperGetGenericTensorAccessorRO(m->output_type[0], regions[region_idx++], task->regions[task_region_idx++], From bc61e9ddbe33ced6574fbf91fafc26212b8a6f56 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 20:39:06 -0500 Subject: [PATCH 126/198] Peft alignment & debugging tools (#1288) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * fix * fix * fix * update * simplify tensor names * fix * fixes and updates * fixes * fix * cleanup * . 
* restore softmax * cleanup * update alignment scripts * newline --- .gitignore | 3 +- include/flexflow/operator.h | 26 +- src/ops/add_bias_residual_layer_norm.cu | 84 +- src/ops/argmax.cc | 5 - src/ops/inc_multihead_self_attention.cu | 102 + src/ops/kernels/linear_kernels.cu | 21 +- src/ops/kernels/residual_rms_norm_kernels.cu | 45 +- src/ops/kernels/rms_norm_kernels.cu | 76 +- src/ops/layer_norm.cu | 69 +- src/ops/linear.cc | 20 +- src/ops/lora_linear.cc | 34 +- src/ops/residual_layer_norm.cu | 188 +- tests/peft/alignment/align_test_utils.py | 240 ++ .../alignment/llama_alignment_tests.ipynb | 2039 +++++++++++++++++ .../peft/alignment/opt_alignment_tests.ipynb | 450 ++++ tests/peft/alignment_tests.ipynb | 1427 ------------ tests/peft/hf_finetune.py | 70 +- tests/peft/hf_serve.py | 8 - tests/peft/qk_prods_alignment.ipynb | 24 - 19 files changed, 3161 insertions(+), 1770 deletions(-) create mode 100644 tests/peft/alignment/align_test_utils.py create mode 100644 tests/peft/alignment/llama_alignment_tests.ipynb create mode 100644 tests/peft/alignment/opt_alignment_tests.ipynb delete mode 100644 tests/peft/alignment_tests.ipynb delete mode 100644 tests/peft/qk_prods_alignment.ipynb diff --git a/.gitignore b/.gitignore index 0579eb5a74..23da3c5899 100644 --- a/.gitignore +++ b/.gitignore @@ -189,4 +189,5 @@ python/flexflow/version.txt inference_tensors hf_peft_tensors -Untitled-1.ipynb \ No newline at end of file +Untitled-1.ipynb +Untitled-2.ipynb diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index e3f28756ec..2dfba77b77 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -267,7 +267,7 @@ class Op { bool fwd_pass = true, bool before_kernel = false) { // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; + char const *folder_path = "./inference_tensors/"; struct stat st = {0}; if (stat(folder_path, &st) == -1) { // Directory does not exist, create it @@ -275,20 +275,26 @@ class Op { } // output base filepath, shared by all tensors from the same operator std::string op_name_without_uid = get_op_name_without_uid(m); - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - (fwd_pass ? "_decoding-step_" : "_bwd-step_") + - (fwd_pass ? std::to_string(m->decoding_step) - : std::to_string(m->bwd_step)) + - "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + - "_layer-name_" + op_name_without_uid + "_shard-id_" + - std::to_string(shard_id); + std::cout << (fwd_pass ? 
"INF " : "BWD ") << op_name_without_uid + << std::endl; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + if (fwd_pass) { + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + } else { + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + } + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(shard_id); if (before_kernel) { base_filepath += "_pre"; } // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(base_filepath + "_batch_config"); } // save all inputs for (int i = 0; i < input_tensors.size(); i++) { diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ab017ed46c..505806a2b9 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -91,25 +91,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void LayerNormFusedForwardKernel(int64_t N, int64_t attn_bias_dim, @@ -128,20 +109,17 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const int64_t bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -153,7 +131,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -179,30 +157,22 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - attn_bias_dim, - m->eps, - input_ptr, - attn_bias_ptr, - residual_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); } /*static*/ @@ -242,20 +212,17 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int in_dim = - added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( @@ -264,14 +231,14 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + tokens_previous_requests * in_dim, + added_output.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + tokens_previous_requests * in_dim, + added_output.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); @@ -281,6 +248,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } } + // inference kernel int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; int residual_volume = residual.domain.get_volume(); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index dd0e2bb822..cabb8b204f 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,11 +392,6 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - // Note that we free activation allocator here since argmax is the - // last operator in forward - if (m->handle.peft_activation_allocator != nullptr) { - m->handle.peft_activation_allocator->free_all(); - } InferenceResult ir; if 
(m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 452a8c09f6..4c3b0ee4b6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -907,6 +907,22 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(shard_id); + return base_filepath; +} + template void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -934,6 +950,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; // } // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -995,6 +1012,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_o_proj_in_grad"; + save_tensor(C, m_ * n_, filename.c_str()); + } } // Step 2: compute gradients w.r.t. value { @@ -1046,6 +1069,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_v_proj_in_grad"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + "_qk_prods_softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1094,6 +1126,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_qk_prods_softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + "_vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } } // Step 4: softmax backpropagation { @@ -1120,6 +1161,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, m->qk_tensor, m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_qk_prods_softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; if (entries_above_diagonal > 0) { @@ -1135,6 +1185,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + "_qk_prods_softmax_grad_in_masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } } // Step 5: compute gradients w.r.t. key { @@ -1189,6 +1246,16 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + "_devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } } // Step 6: compute gradients w.r.t query { @@ -1239,7 +1306,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } } + // Step 7: perform rotary position embeddings (RoPE) bwd { if (*m->apply_rotary_embedding) { @@ -1257,8 +1332,30 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qProjSize, num_tokens, m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + "_devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); } } + // Step 8: compute gradients w.r.t. input { float alpha = 1.0f, beta = 0.0f; @@ -1300,6 +1397,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_attn_final_grad_in"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } } } } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 5306be9bdf..a3f5c797de 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -200,17 +200,16 @@ void inference_kernel_wrapper(LinearMeta *m, stream); } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, - input_ptr, - output_ptr, - weight_ptr, - bias_ptr, - in_dim, - out_dim, - batch_size, - stream); + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); } - if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { @@ -247,14 +246,14 @@ void inference_kernel_wrapper(LinearMeta *m, if (m->output_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->output_activation_buffer, - static_cast(output_ptr) + first_token_offset * out_dim, + static_cast(output_ptr) + first_token_offset * out_dim, data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->output_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->output_activation_buffer, - static_cast(output_ptr) + first_token_offset * out_dim, + static_cast(output_ptr) + first_token_offset * out_dim, data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, cudaMemcpyDeviceToDevice, stream)); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index b12d105c1b..664c1ed13b 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -221,7 +221,28 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, assert(weight.data_type == output.data_type); assert(residual_output.data_type == output.data_type); - // save input activation if needed for PEFT + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + // save input activation if needed for PEFT. 
This must be done after the + // forward kernel since that's where we add the residual if (bc->num_active_peft_tokens() > 0) { // Check that we have at most one request that requires peft_bwd int num_peft_requests = 0; @@ -247,7 +268,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -275,26 +296,6 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, } } - if (output.data_type == DT_HALF) { - forward_kernel(m, - input1.get_half_ptr(), - input2.get_half_ptr(), - weight.get_half_ptr(), - residual_output.get_half_ptr(), - output.get_half_ptr(), - stream); - } else if (output.data_type == DT_FLOAT) { - forward_kernel(m, - input1.get_float_ptr(), - input2.get_float_ptr(), - weight.get_float_ptr(), - residual_output.get_float_ptr(), - output.get_float_ptr(), - stream); - } else { - assert(false && "Unsupported data type"); - } - if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index d0702d651e..b11e954622 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -201,53 +201,53 @@ void inference_kernel_wrapper(RMSNormMeta *m, // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { - // check that at most one dimension after the first is > 1. TODO(goliaro): - // support case where this condition does not hold - int non_unit_dims_encountered = 0; - for (int i = 1; i < input.domain.get_dim(); i++) { - int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; - if (dim_i > 1) { - non_unit_dims_encountered++; + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; } } - assert(non_unit_dims_encountered <= 1); - - // allocate space for all peft tokens - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(input.data_type) * bc->num_active_peft_tokens() * - in_dim); - - int tokens_previous_requests = 0; + assert(num_peft_requests <= 1); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - // Skip non-PEFT requests and PEFT forward-only requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - if (input.data_type == DT_FLOAT) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_float_ptr() + tokens_previous_requests * in_dim, - data_type_size(input.data_type) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } 
else if (input.data_type == DT_HALF) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_half_ptr() + tokens_previous_requests * in_dim, - data_type_size(input.data_type) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } } } } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 1d4e94d7d5..bfbb2faae9 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -96,25 +96,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -129,18 +110,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -152,7 +128,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -173,25 +149,18 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -276,18 +245,16 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -297,14 +264,14 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - input.get_float_ptr() + tokens_previous_requests * in_dim, + input.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - input.get_half_ptr() + tokens_previous_requests * in_dim, + input.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e23a6f48ca..209f514f65 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -621,6 +621,8 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + std::string op_name_without_uid = Linear::get_op_name_without_uid(m); + printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } @@ -653,14 +655,14 @@ void Linear::inference_task(Task const *task, assert(bias.domain.get_volume() == static_cast(out_dim)); } inference_kernel_wrapper(m, - bc, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -735,7 +737,7 @@ void Linear::peft_bwd_task(Task const *task, return; } assert(regions.size() == 3); - 
assert(task->regions.size() == 3 ); + assert(task->regions.size() == 3); if (m->quantization_type == DT_NONE) { assert(m->input_type[0] == m->weight_type[0]); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e39b444af4..c02bddc5a6 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -473,7 +473,7 @@ void LoraLinear::inference_task(Task const *task, int shard_id = task->index_point.point_data[0]; // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; + char const *folder_path = "./inference_tensors/"; struct stat st = {0}; if (stat(folder_path, &st) == -1) { // Directory does not exist, create it @@ -493,15 +493,18 @@ void LoraLinear::inference_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += + "_layers_" + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + lora_layername_substr + "_shard_" + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(base_filepath + "_batch_config"); } std::string filename = base_filepath + "_input_" + std::to_string(0); @@ -634,7 +637,7 @@ void LoraLinear::peft_bwd_task(Task const *task, int shard_id = task->index_point.point_data[0]; // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; + char const *folder_path = "./inference_tensors/"; struct stat st = {0}; if (stat(folder_path, &st) == -1) { // Directory does not exist, create it @@ -654,15 +657,18 @@ void LoraLinear::peft_bwd_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_bwd-step_" + std::to_string(m->bwd_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += + "_layers_" + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + lora_layername_substr + "_shard_" + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(base_filepath + "_batch_config"); } std::string filename = base_filepath + "_input_" + std::to_string(0); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 1f87949234..0ba462cde5 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -91,25 +91,6 
@@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void ResidualLayerNormKernel(int64_t N, float eps, @@ -127,8 +108,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -137,12 +117,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -154,7 +132,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -178,28 +156,51 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualLayerNormKernel - <<>>(m->effective_num_elements, - m->eps, - input_ptr, - residual1_ptr, - residual2_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } } /*static*/ @@ -222,6 +223,33 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + if (m->input_type[0] == DT_FLOAT) { + ResidualLayerNorm::inference_kernel( + m, + input.get_float_ptr(), + residual1.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, + added_output.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + ResidualLayerNorm::inference_kernel( + m, + input.get_half_ptr(), + residual1.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, + added_output.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { // Check that we have at most one request that requires peft_bwd @@ -248,7 +276,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -276,30 +304,14 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } - if (m->input_type[0] == DT_FLOAT) { - ResidualLayerNorm::inference_kernel( - m, - input.get_float_ptr(), - residual1.get_float_ptr(), - m->use_two_residuals ? residual2.get_float_ptr() : nullptr, - added_output.get_float_ptr(), - output.get_float_ptr(), - m->elementwise_affine ? gamma.get_float_ptr() : nullptr, - (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, - stream); - } else if (m->input_type[0] == DT_HALF) { - ResidualLayerNorm::inference_kernel( - m, - input.get_half_ptr(), - residual1.get_half_ptr(), - m->use_two_residuals ? residual2.get_half_ptr() : nullptr, - added_output.get_half_ptr(), - output.get_half_ptr(), - m->elementwise_affine ? gamma.get_half_ptr() : nullptr, - (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, - stream); - } else { - assert(false && "unsupport datatype in layernorm"); + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } } if (m->profiling) { @@ -740,6 +752,34 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py new file mode 100644 index 0000000000..b0cb5fe428 --- /dev/null +++ b/tests/peft/alignment/align_test_utils.py @@ -0,0 +1,240 @@ +import os, re, torch +import numpy as np +abs_dirname = os.path.dirname(os.path.abspath(__file__)) +hf_path = 
os.path.join(abs_dirname, "hf_peft_tensors") +ff_path = os.path.join(os.path.dirname(os.path.dirname(abs_dirname)), "build", "inference_tensors") +def print_unique_files_list(dirname): + files_list = os.listdir(dirname) + for f in sorted(files_list): + match = re.search(r'layers.\d+', f) + if match: + if "layers." in match[0]: + layer_num = int(match[0].split(".")[1]) + if layer_num > 0: + files_list.remove(f) + elif "layers_" in match[0]: + layer_num = int(match[0].split("_")[1]) + if layer_num > 0 and layer_num != 100: + files_list.remove(f) + return sorted(files_list) +def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2): + if not (os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)): + print(hf_tensor_filepath, os.path.exists(hf_tensor_filepath)) + print(ff_tensor_filepath, os.path.exists(ff_tensor_filepath)) + assert False + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert(len(hf_tensor) == 1) + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',') + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + #print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len_hf_tensor) + print("Ok!") +def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2): + assert(os.path.exists(hf_tensor_filepath)) + assert(os.path.exists(ff_tensor1_filepath)) + assert(os.path.exists(ff_tensor2_filepath)) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert(len(hf_tensor) == 1) + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',') + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor1 = ff_tensor1[:len_hf_tensor] + ff_tensor2 = ff_tensor2[:len_hf_tensor] + ff_tensor = ff_tensor1 - ff_tensor2 + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + #print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len_hf_tensor) + print("Ok!") +def compare_hf_tensors(tensor1_fp, tensor2_fp): + assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp)) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + 
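The comparison helpers above tolerate small elementwise drift: they first try np.allclose with an absolute tolerance and, when that fails, count the indices where np.isclose is False, accepting the pair as long as at most 5% of elements disagree. A minimal, self-contained sketch of that acceptance rule on synthetic data (array names are illustrative only, not part of the patch):

import numpy as np

def mismatch_fraction(a, b, atol=1e-2):
    # fraction of elements where |a - b| exceeds the absolute tolerance
    return np.mean(~np.isclose(a, b, atol=atol))

rng = np.random.default_rng(0)
ref = rng.standard_normal(1000).astype(np.float32)
noisy = ref + rng.normal(scale=1e-3, size=ref.shape).astype(np.float32)
noisy[:30] += 1.0  # corrupt 3% of the entries

frac = mismatch_fraction(ref, noisy, atol=1e-2)
print(f"{100 * frac:.2f}% mismatch")
assert frac <= 0.05  # same 5% acceptance threshold used by compare_tensors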
if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert(len(hf_tensor1) == 1) + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert(len(hf_tensor2) == 1) + hf_tensor2 = hf_tensor2[0] + assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape) + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + if not (np.allclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())): + print(f"mismatch between {tensor1_fp} and {tensor2_fp}") + print(hf_tensor1) + print(hf_tensor2) + print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())) + mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0] + print(mismatches) + assert(False) + print("Ok!") + +def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp): + assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp)) + hf_tensor_sum = torch.load(tensor_sum_fp) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list: + assert(len(hf_tensor_sum) == 1) + hf_tensor_sum = hf_tensor_sum[0] + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert(len(hf_tensor1) == 1) + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert(len(hf_tensor2) == 1) + hf_tensor2 = hf_tensor2[0] + assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape) + assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape) + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + hf_tensor_sum = torch.nan_to_num(hf_tensor_sum) + sum_check_tensor = hf_tensor1 + hf_tensor2 + if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())): + print(f"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}") + print(tensor_sum_fp) + print(sum_check_tensor) + print(hf_tensor1) + print(hf_tensor2) + print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())) + mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0] + print(mismatches) + assert(False) + print("Ok!") +def check_hf_zero_tensor(hf_tensor_fp): + assert(os.path.exists(hf_tensor_fp)) + hf_tensor1 = torch.load(hf_tensor_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert(len(hf_tensor1) == 1) + hf_tensor1 = hf_tensor1[0] + assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0) +def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=""): + assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert(len(hf_tensor) == 1) + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',') + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + print(f"{txt} - HF tensor:") + print(hf_tensor) + print(f"{txt} - FF tensor: ") + print(ff_tensor) +def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1): + assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)) + ff_tensor1 = 
np.loadtxt(ff_tensor1_fp, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',') + + if (ff_tensor1.shape != ff_tensor2.shape): + print(ff_tensor1.shape, ff_tensor2.shape) + assert(ff_tensor1.shape == ff_tensor2.shape) + + if max_len > -1: + ff_tensor1 = ff_tensor1[:max_len] + ff_tensor2 = ff_tensor2[:max_len] + + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len(ff_tensor1)) + print("Ok!") +def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5): + assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)) + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',') + minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0]) + ff_tensor1 = ff_tensor1[:minlen] + ff_tensor2 = ff_tensor2[:minlen] + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len(ff_tensor1)) + print("Ok!") +def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5): + assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)) + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',') + ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',') + + ff_sum = ff_tensor1 + ff_tensor2 + assert(ff_tensor1.shape == ff_tensor2.shape) + + mismatches = [] + if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance): + print(f"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(f"Sum Tensor: {ff_tensor_sum}\nActual sum:{ff_sum}") + print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0] + print(mismatches) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len(ff_tensor1)) + print("Ok!") +def load_ff_tensor(filename, shape): + if ff_path not in filename: + filename = os.path.join(ff_path, filename) + ff_tensor = np.loadtxt(filename, delimiter=',').reshape(shape, order = 'F') + return ff_tensor +def load_hf_tensor(filename): + if hf_path not in filename: + filename = os.path.join(hf_path, filename) + hf_tensor = torch.load(filename) + hf_tensor = hf_tensor.detach().cpu().numpy() + return hf_tensor +def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): + assert(hf_tensor.shape == ff_tensor.shape) + mismatches = [] + if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): + print(f"mismatch between hf_tensor and ff_tensor") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + 
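The FlexFlow tensor dumps that these helpers read follow the naming scheme introduced by the C++ changes earlier in this patch: an optional model prefix, a fwd_step_/bwd_step_ counter, the transformer layer number, the operator name, the shard id, and a per-tensor suffix such as "input_0" or "output_0". A hedged sketch of a path builder following that layout; the operator name in the usage line is illustrative, not part of the patch:

import os

def ff_debug_tensor_path(step, layer_num, op_name, suffix,
                         shard_id=0, bwd=False, model_id=0,
                         folder="./inference_tensors/"):
    # Mirrors the base_filepath construction above: optional model prefix,
    # fwd_step_/bwd_step_ counter, layer number, operator name, shard id,
    # then the tensor-specific suffix.
    name = ""
    if model_id > 0:
        name += f"model_{model_id}_"
    name += f"{'bwd' if bwd else 'fwd'}_step_{step}"
    name += f"_layers_{layer_num}_{op_name}_shard_{shard_id}_{suffix}"
    return os.path.join(folder, name)

# e.g. the attention output of decoding step 0 in layer 11
print(ff_debug_tensor_path(0, 11, "layers_11_attention", "output_0"))
# ./inference_tensors/fwd_step_0_layers_11_layers_11_attention_shard_0_output_0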
print(mismatches) + len_hf_tensor = hf_tensor.flatten().shape[0] + assert(len(mismatches) <= .05*len_hf_tensor) + print("Ok!") \ No newline at end of file diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb new file mode 100644 index 0000000000..414280cff5 --- /dev/null +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -0,0 +1,2039 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for i in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_RMSNorm_shard_0_output_0\"\n", + " if i > 0:\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_norm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out)\n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, 
ff_gate_proj_out)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- LM head --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Final Norm --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "# ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "print(\"-- LM head --\")\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = 
f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "print(\"-- Final Norm --\")\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_path}/norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > 
self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... 
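The rotary-embedding helpers defined in the cell above mirror the Hugging Face implementation. A short usage sketch, assuming the LlamaRotaryEmbedding instance (rotary_emb) and the head_dim / kv_seq_len values from that cell; the final assert checks the norm-preserving property of the rotation:

import torch

bs, n_heads = 1, 12
q = torch.randn(bs, n_heads, kv_seq_len, head_dim)
k = torch.randn(bs, n_heads, kv_seq_len, head_dim)
cos, sin = rotary_emb(q, seq_len=kv_seq_len)
position_ids = torch.arange(kv_seq_len).unsqueeze(0)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
# RoPE rotates each (x1, x2) pair by a position-dependent angle,
# so per-head vector norms are preserved up to float error
assert torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-4)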
True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_feed_forward_w2_shard_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_attention_shard_0_o_proj_in_grad\n", + "HF: [ 1.2223595e+06 -2.6348565e+06 -5.0760525e+05 ... 6.8275871e+01\n", + " -5.8116108e+01 9.5347488e+01]\n", + "FF:[ 1.22235925e+06 -2.63485625e+06 -5.07605000e+05 ... 6.82758865e+01\n", + " -5.81161423e+01 9.53475494e+01]\n", + "[ True True True ... True True True]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.22235950e+06 9.93645859e+01 -2.82157593e+01 ... -3.94578514e+01\n", + " -1.98409653e+01 -1.33438044e+01]\n", + " [-2.63485650e+06 -1.13461929e+02 1.14223976e+02 ... 7.52578735e+01\n", + " 1.33362747e+02 6.78501587e+01]\n", + " [-5.07605250e+05 4.34111862e+01 8.10619354e+01 ... 4.70537224e+01\n", + " 4.02149696e+01 6.98045502e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295319e+02 9.98417091e+00 ... 4.90895653e+01\n", + " 9.71413574e+01 6.82758713e+01]\n", + " [-3.64456375e+06 -2.43692596e+02 -6.85474396e+00 ... -3.71503868e+01\n", + " -1.34136658e+01 -5.81161079e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64005566e+00 ... 2.11662292e+00\n", + " 3.37400856e+01 9.53474884e+01]]\n", + "FF:[[ 1.22235925e+06 9.93645630e+01 -2.82157211e+01 ... -3.94577713e+01\n", + " -1.98408775e+01 -1.33438234e+01]\n", + " [-2.63485625e+06 -1.13461960e+02 1.14224037e+02 ... 7.52577744e+01\n", + " 1.33362701e+02 6.78501205e+01]\n", + " [-5.07605000e+05 4.34111404e+01 8.10619278e+01 ... 4.70536804e+01\n", + " 4.02149124e+01 6.98045578e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295227e+02 9.98412323e+00 ... 4.90895386e+01\n", + " 9.71413727e+01 6.82758865e+01]\n", + " [-3.64456400e+06 -2.43692627e+02 -6.85472488e+00 ... -3.71504822e+01\n", + " -1.34137001e+01 -5.81161423e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64004517e+00 ... 2.11670875e+00\n", + " 3.37400322e+01 9.53475494e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.2223588e+06 -2.6348530e+06 -5.0760291e+05 ... 3.0279325e+06\n", + " -3.6445672e+06 3.3192180e+06]\n", + " [-4.2496326e+02 1.1576636e+03 9.8397858e+02 ... 1.6480791e+03\n", + " -5.9697235e+02 6.2627173e+02]\n", + " [-2.2012039e+01 6.6097900e+01 3.9933994e+01 ... 5.7103355e+01\n", + " -1.5968766e+01 3.6536639e+00]\n", + " ...\n", + " [-1.2302110e+00 5.3052688e+00 2.1982718e+00 ... 1.3990868e+00\n", + " -5.5132383e-01 4.8985812e-01]\n", + " [-1.0771493e+00 6.9571300e+00 2.7373023e+00 ... 4.9663010e+00\n", + " -9.9705428e-01 2.1829298e+00]\n", + " [-5.9534687e-01 3.0272012e+00 3.1143982e+00 ... 2.4072502e+00\n", + " -2.0490403e+00 3.3617332e+00]]\n", + "FF:[[ 1.22235850e+06 -2.63485275e+06 -5.07602656e+05 ... 3.02793250e+06\n", + " -3.64456750e+06 3.31921800e+06]\n", + " [-4.24962585e+02 1.15766296e+03 9.83978577e+02 ... 1.64807898e+03\n", + " -5.96972351e+02 6.26271790e+02]\n", + " [-2.20120354e+01 6.60979462e+01 3.99340210e+01 ... 5.71033745e+01\n", + " -1.59687757e+01 3.65366316e+00]\n", + " ...\n", + " [-1.23020661e+00 5.30526114e+00 2.19826817e+00 ... 1.39908671e+00\n", + " -5.51325083e-01 4.89858717e-01]\n", + " [-1.07714510e+00 6.95712519e+00 2.73729825e+00 ... 4.96630049e+00\n", + " -9.97055829e-01 2.18292713e+00]\n", + " [-5.95347941e-01 3.02720070e+00 3.11439991e+00 ... 2.40725493e+00\n", + " -2.04904509e+00 3.36174107e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.4363425925925934% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523600e+06 -1.27625293e+03 -4.39336700e+01 ... -3.34414597e+01\n", + " 2.38162422e+01 3.15938187e+01]\n", + " [-9.55138900e+06 6.71377319e+02 2.06871674e+02 ... -3.86393127e+01\n", + " 2.14817867e+01 -6.58600464e+01]\n", + " [ 1.14522660e+07 2.19898950e+03 -6.89660644e+00 ... 9.51594448e+00\n", + " -1.68611774e+01 6.02474518e+01]\n", + " ...\n", + " [ 2.10891850e+06 3.78648633e+03 1.02701196e+03 ... 3.59794846e+01\n", + " 5.03901253e+01 4.19777679e+01]\n", + " [ 2.11695400e+06 -2.36282440e+02 -1.08002762e+02 ... 9.36448860e+00\n", + " 3.84096107e+01 -7.51954842e+00]\n", + " [ 7.39155000e+06 1.11731921e+03 3.38370087e+02 ... 
3.70398293e+01\n", + " 1.77627277e+01 9.76782227e+01]]\n", + "6.011284722222222% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... True True True]\n", + "[ 88 138 187 203 232 242 493 657 750 900 1198 1249\n", + " 1287 1305 1414 1428 1490 1588 1600 1612 1625 1657 1676 1677\n", + " 1692 1694 1724 1730 1772 1822 1825 1838 1853 1910 2035 2043\n", + " 2053 2059 2073 2078 2123 2145 2214 2238 2241 2285 2292 2389\n", + " 2542 2582 2589 2599 2674 2688 2711 2840 2856 2961 2963 2980\n", + " 3064 3176 3192 3255 3262 3278 3338 3341 3412 3419 3492 3590\n", + " 3624 3646 3657 3807 3840 3842 3846 3883 3887 4005 4049 4071\n", + " 4076 4077 4079 4137 4142 4192 4193 4202 4218 4224 4273 4355\n", + " 4358 4381 4401 4435 4469 4499 4514 4546 4598 4619 4747 4846\n", + " 4872 4916 4952 4966 5016 5067 5107 5112 5116 5194 5225 5350\n", + " 5364 5403 5515 5537 5550 5578 5650 5653 5654 5736 5751 5837\n", + " 5870 5881 5972 5998 6006 6051 6061 6107 6129 6204 6236 6292\n", + " 6296 6327 6382 6393 6403 6420 6424 6436 6468 6542 6599 6675\n", + " 6681 6711 6723 6767 6823 6914 6983 7047 7064 7133 7167 7197\n", + " 7198 7209 7528 7537 7538 7686 7850 7855 7889 7910 7919 7927\n", + " 7937 7939 8089 8101 8157 8169 8175 8223 8292 8304 8306 8342\n", + " 8351 8414 8475 8500 8543 8558 8609 8656 8687 8704 8724 8726\n", + " 8777 8816 8826 8871 8904 8934 8983 9012 9033 9043 9068 9093\n", + " 9125 9133 9144 9151 9154 9217 9222 9320 9335 9367 9398 9421\n", + " 9434 9521 9547 9633 9702 9726 9763 9949 10018 10053 10062 10079\n", + " 10137 10149 10203 10261 10269 10292 10312 10332 10471 10478 10514 10596\n", + " 10645 10676 10678 10781 10795 10810 10833 10891 10904 10935 10957 10977\n", + " 10982 11028 11095 11172 11223 11251 11283 11303 11319 11374 11392 11437\n", + " 11486 11627 11678 11750 11759 11979 11996 12019 12126 12237 12262 12288\n", + " 12303 12309 12315 12387 12543 12569 12613 12648 12786 12852 12866 12879\n", + " 12947 12963 13037 13058 13261 13284 13312 13394 13399 13427 13526 13527\n", + " 13592 13695 13741 13752 13775 13803 13812 13866 13902 14049 14170 14241\n", + " 14354 14382 14426 14451 14455 14486 14502 14582 14820 14934 14961 14976\n", + " 15000 15003 15014 15077 15096 15108 15135 15148 15165 15219 15232 15290\n", + " 15339 15345 15819 15945 15994 16077 16135 16218 16231 16233 16239 16243\n", + " 16295 16311 16339 16356 16366 16417 16456 16498 16502 16503 16506 16547\n", + " 16585 16603 16611 16633 16661 16683 16704 16710 16723 16724 16745 16754\n", + " 16773 16787 16789 16818 16829 16833 16913 16933 17025 17033 17037 17055\n", + " 17084 17098 17109 17176 17225 17240 17292 17294 17339 17390 17427 17437\n", + " 17579 17626 17630 17654 17719 17902 17912 18023 18025 18124 18203 18339\n", + " 18344]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between 
/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 4.7819588e+07 3.8833264e+07 4.7789860e+07 ... 1.0804405e+00\n", + " 2.7186510e-01 -2.9918199e+00]\n", + "FF:[ 4.78195960e+07 3.88332640e+07 4.77898600e+07 ... 1.08044124e+00\n", + " 2.71864563e-01 -2.99182224e+00]\n", + "[ True True True ... True True True]\n", + "[ 109 211 312 422 590 832 835 1016 1053 1076 1268 1353 1374 1693\n", + " 1701 1710 1722 1832 1954 1965 1997 2076 2124 2146 2378 2520 2605 2624\n", + " 2967 3007 3015]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_input_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 9.3464905e+01\n", + " 7.5613129e+01 7.6598846e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 9.34649200e+01\n", + " 7.56131058e+01 7.65989227e+01]\n", + "[ True True True ... True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_o_proj_in_grad\n", + "HF: [-9.4470595e+09 -7.3870331e+09 1.2659395e+10 ... -2.8149616e+01\n", + " 1.7019112e+02 -7.7236428e+00]\n", + "FF:[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... -2.81496239e+01\n", + " 1.70191177e+02 -7.72364044e+00]\n", + "[ True True True ... True True True]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 2.28078384e+01 3.18554016e+02 ... 1.17267204e+02\n", + " 2.06791725e+01 1.13138672e+02]\n", + " [-7.38703309e+09 -7.36898804e+00 7.93705673e+01 ... 2.04039650e+01\n", + " 3.18331490e+01 5.44241562e+01]\n", + " [ 1.26593946e+10 1.77534424e+02 -2.97175941e+01 ... 1.16716766e+01\n", + " 7.70214081e+01 2.81902496e+02]\n", + " ...\n", + " [ 4.51210445e+10 3.63867615e+02 -8.04915466e+01 ... -1.34332123e+02\n", + " -1.22151840e+02 -2.81496162e+01]\n", + " [-1.39591885e+10 1.59216873e+02 6.11343079e+01 ... 1.56675262e+02\n", + " 9.68551483e+01 1.70191116e+02]\n", + " [-1.29442345e+10 -2.39441833e+02 2.73647644e+02 ... -4.41197014e+01\n", + " -9.48526230e+01 -7.72364283e+00]]\n", + "FF:[[-9.44706150e+09 2.28079376e+01 3.18553864e+02 ... 1.17267227e+02\n", + " 2.06791859e+01 1.13138741e+02]\n", + " [-7.38703309e+09 -7.36921692e+00 7.93703690e+01 ... 2.04038925e+01\n", + " 3.18332825e+01 5.44241333e+01]\n", + " [ 1.26593966e+10 1.77534454e+02 -2.97174206e+01 ... 1.16717224e+01\n", + " 7.70213699e+01 2.81902618e+02]\n", + " ...\n", + " [ 4.51210527e+10 3.63867554e+02 -8.04915695e+01 ... -1.34332092e+02\n", + " -1.22151901e+02 -2.81496239e+01]\n", + " [-1.39591834e+10 1.59216995e+02 6.11343040e+01 ... 1.56675293e+02\n", + " 9.68551559e+01 1.70191177e+02]\n", + " [-1.29442304e+10 -2.39441772e+02 2.73647644e+02 ... -4.41196594e+01\n", + " -9.48526916e+01 -7.72364044e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 -7.38703309e+09 1.26593946e+10 ... 4.51210445e+10\n", + " -1.39591885e+10 -1.29442345e+10]\n", + " [ 1.14852783e+03 4.39543152e+02 1.07877356e+03 ... -2.42416113e+03\n", + " 2.64504834e+03 4.68633453e+02]\n", + " [ 5.72417107e+01 4.12602806e+01 -2.27319489e+01 ... -3.40788422e+01\n", + " 4.86237946e+01 1.25752163e+01]\n", + " ...\n", + " [ 6.76848269e+00 8.23165894e+00 2.10253639e+01 ... -3.19590777e-01\n", + " 3.68098617e-01 -1.95310101e-01]\n", + " [ 4.08574820e+00 5.33035660e+00 1.41003275e+01 ... -1.35607815e+00\n", + " 4.06074905e+00 -7.67630756e-01]\n", + " [ 2.03186665e+01 9.77407932e+00 5.06271019e+01 ... -6.80029154e-01\n", + " 4.11142111e+00 -1.86585218e-01]]\n", + "FF:[[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... 4.51210527e+10\n", + " -1.39591834e+10 -1.29442304e+10]\n", + " [ 1.14852808e+03 4.39542755e+02 1.07877344e+03 ... -2.42416138e+03\n", + " 2.64504932e+03 4.68633698e+02]\n", + " [ 5.72415771e+01 4.12602005e+01 -2.27318707e+01 ... -3.40787392e+01\n", + " 4.86236725e+01 1.25752039e+01]\n", + " ...\n", + " [ 6.76847696e+00 8.23167515e+00 2.10253181e+01 ... -3.19590837e-01\n", + " 3.68098557e-01 -1.95310280e-01]\n", + " [ 4.08574867e+00 5.33037567e+00 1.41003180e+01 ... -1.35607564e+00\n", + " 4.06074095e+00 -7.67629445e-01]\n", + " [ 2.03186874e+01 9.77407932e+00 5.06271439e+01 ... -6.80029511e-01\n", + " 4.11142349e+00 -1.86585203e-01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "6.640625% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-5.1505955e+10 -4.7166772e+03 -1.3288132e+02 ... -3.0123844e+00\n", + " -5.5234032e+01 6.0299168e+00]\n", + " [-3.5960029e+10 -5.3263096e+03 -1.9434322e+02 ... -5.6601189e+01\n", + " -1.0787462e+02 -6.0718418e+01]\n", + " [ 4.8131662e+10 1.1578307e+04 1.7744476e+02 ... -5.6970375e+01\n", + " -1.7497168e+01 -7.2297249e+00]\n", + " ...\n", + " [-9.0346426e+08 6.4752144e+03 3.2408417e+02 ... 6.1075470e+01\n", + " 8.5356834e+01 8.3221588e+01]\n", + " [-5.0754217e+09 -2.2929268e+03 -1.4913528e+02 ... 8.6639397e+01\n", + " 1.1156468e+02 1.0695674e+02]\n", + " [ 5.5844772e+09 3.0225920e+03 -6.3137859e+01 ... -6.5270996e+01\n", + " 8.2730171e+01 -1.0107367e+02]]\n", + "ff_attn_in: (768, 24)\n", + "[[-5.15059548e+10 -4.71667773e+03 -1.32881012e+02 ... -3.01225996e+00\n", + " -5.52339973e+01 6.02991867e+00]\n", + " [-3.59600292e+10 -5.32630957e+03 -1.94343079e+02 ... -5.66010437e+01\n", + " -1.07874649e+02 -6.07182846e+01]\n", + " [ 4.81316659e+10 1.15783076e+04 1.77444519e+02 ... -5.69703102e+01\n", + " -1.74972763e+01 -7.22990799e+00]\n", + " ...\n", + " [-9.03455232e+08 6.47521484e+03 3.24083832e+02 ... 6.10753632e+01\n", + " 8.53567886e+01 8.32217255e+01]\n", + " [-5.07543654e+09 -2.29292749e+03 -1.49135025e+02 ... 8.66392517e+01\n", + " 1.11564789e+02 1.06956917e+02]\n", + " [ 5.58446592e+09 3.02259229e+03 -6.31376152e+01 ... 
-6.52709351e+01\n", + " 8.27302551e+01 -1.01073837e+02]]\n", + "7.025824652777778% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415\n", + " 428 482 492 514 526 531 671 731 763 777 893 927\n", + " 984 1105 1184 1206 1418 1541 1548 1572 1577 1613 1619 1643\n", + " 1658 1661 1691 1701 1706 1726 1757 1784 1815 1833 1849 1856\n", + " 1880 1891 1921 1956 1969 2012 2021 2028 2030 2059 2065 2144\n", + " 2149 2183 2210 2238 2292 2342 2357 2384 2414 2495 2531 2565\n", + " 2597 2662 2713 2781 2821 2829 2877 2904 2921 2927 2962 2973\n", + " 3044 3066 3094 3100 3106 3159 3193 3251 3377 3389 3397 3427\n", + " 3436 3570 3594 3703 3729 3770 3772 3780 3811 3840 3842 3860\n", + " 3907 3920 3929 3946 3955 3969 4005 4009 4034 4048 4077 4089\n", + " 4104 4129 4134 4178 4202 4212 4219 4239 4245 4256 4273 4373\n", + " 4407 4463 4464 4465 4481 4511 4537 4541 4543 4549 4597 4599\n", + " 4633 4759 4760 4789 4846 4884 4901 4930 4954 4971 4993 5024\n", + " 5030 5041 5050 5116 5130 5163 5207 5224 5282 5313 5322 5349\n", + " 5363 5403 5410 5412 5454 5543 5581 5590 5654 5673 5784 5821\n", + " 5849 5880 5911 5917 5982 6000 6062 6165 6178 6193 6200 6272\n", + " 6322 6351 6366 6376 6380 6382 6393 6412 6420 6430 6433 6446\n", + " 6476 6482 6488 6490 6519 6527 6540 6556 6563 6567 6577 6600\n", + " 6619 6680 6709 6735 6768 6777 6780 6823 6825 6826 6830 6863\n", + " 6880 6912 6988 7006 7030 7071 7077 7102 7123 7244 7264 7367\n", + " 7389 7390 7434 7451 7452 7455 7505 7532 7539 7589 7598 7620\n", + " 7651 7653 7659 7709 7714 7740 7751 7759 7803 7808 7820 7917\n", + " 7923 7926 7949 7962 7966 7978 8002 8004 8040 8050 8052 8068\n", + " 8180 8223 8250 8253 8265 8341 8344 8375 8376 8386 8449 8468\n", + " 8501 8509 8522 8535 8585 8590 8593 8642 8657 8674 8687 8707\n", + " 8714 8726 8729 8737 8756 8769 8801 8846 8850 8865 8907 8998\n", + " 9018 9043 9059 9066 9083 9093 9098 9130 9131 9165 9189 9216\n", + " 9285 9337 9368 9526 9539 9563 9620 9659 9723 9793 9804 9817\n", + " 9820 9827 9908 9995 10053 10128 10135 10143 10205 10253 10274 10292\n", + " 10300 10311 10327 10356 10406 10441 10491 10494 10551 10562 10563 10634\n", + " 10649 10674 10710 10734 10821 10831 10833 10838 10845 10911 10966 10981\n", + " 10988 10990 10998 11008 11044 11049 11100 11127 11141 11197 11250 11269\n", + " 11285 11308 11361 11383 11437 11460 11494 11502 11511 11522 11546 11557\n", + " 11564 11588 11649 11658 11671 11674 11703 11729 11749 11759 11832 11892\n", + " 11979 11988 12000 12038 12063 12078 12107 12119 12165 12259 12269 12270\n", + " 12347 12369 12386 12415 12475 12518 12566 12569 12574 12652 12693 12792\n", + " 12833 12834 12852 12872 12900 12946 13117 13121 13124 13321 13345 13357\n", + " 13427 13431 13446 13473 13526 13635 13638 13662 
13706 13733 13803 13807\n", + " 13852 13882 13912 13924 13962 13969 13986 14023 14036 14046 14085 14110\n", + " 14130 14141 14175 14183 14191 14220 14222 14223 14285 14310 14331 14336\n", + " 14354 14375 14425 14427 14451 14482 14493 14516 14560 14563 14581 14623\n", + " 14671 14677 14679 14680 14685 14688 14742 14799 14860 14868 14870 14872\n", + " 14900 14909 14916 14940 14964 14991 15003 15023 15027 15033 15038 15051\n", + " 15086 15100 15184 15214 15232 15290 15352 15363 15365 15407 15433 15451\n", + " 15522 15577 15707 15720 15725 15739 15830 15837 15875 15937 15965 15985\n", + " 16017 16054 16113 16136 16142 16169 16191 16232 16238 16250 16268 16282\n", + " 16285 16290 16295 16304 16327 16334 16353 16356 16363 16382 16403 16407\n", + " 16408 16409 16458 16459 16495 16497 16499 16500 16516 16532 16595 16603\n", + " 16611 16657 16678 16680 16695 16701 16704 16754 16768 16807 16818 16856\n", + " 16870 16951 16971 16986 16989 16992 17048 17134 17181 17208 17217 17236\n", + " 17243 17319 17363 17398 17448 17471 17497 17557 17646 17654 17659 17692\n", + " 17754 17947 17957 17969 17975 18029 18128 18146 18196 18206 18207 18250\n", + " 18265 18313 18406]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 5.0590863e+10 3.7823513e+11 -5.0394451e+11 ... -5.5814421e-01\n", + " 2.2970559e-01 -1.2293311e+00]\n", + "FF:[ 5.05906831e+10 3.78235290e+11 -5.03944544e+11 ... -5.58144033e-01\n", + " 2.29705781e-01 -1.22933090e+00]\n", + "[ True True True ... True True True]\n", + "[ 189 254 317 418 515 546 577 634 636 675 712 808 1011 1030\n", + " 1080 1091 1132 1168 1254 1265 1285 1287 1354 1381 1427 1459 1506 1620\n", + " 1654 1752 1887 1897 1900 1937 1981 1985 1986 2003 2029 2152 2181 2295\n", + " 2395 2426 2445 2673 2687 2859 2947 2977 3037]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_input_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_output_0\n", + "HF: [-6.3320325e+13 -4.4365129e+13 6.3550937e+13 ... 7.2449814e+01\n", + " 8.6617142e+01 8.3981407e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 7.24498901e+01\n", + " 8.66170959e+01 8.39814606e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_o_proj_in_grad\n", + "HF: [ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... 2.5297220e+02\n", + " -8.1722275e+01 -7.0014725e+01]\n", + "FF:[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... 2.52972260e+02\n", + " -8.17222137e+01 -7.00146637e+01]\n", + "[ True True True ... True True True]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.28854608e+13 6.37500977e+02 2.96775421e+02 ... 8.35403061e+01\n", + " 1.72460327e+02 2.90482426e+01]\n", + " [-6.08358210e+13 -5.23222847e+01 -2.34542664e+02 ... -1.87500763e+01\n", + " -8.99429398e+01 8.64021378e+01]\n", + " [-7.97326117e+13 -4.24736328e+02 -1.82208099e+02 ... 3.21808720e+00\n", + " -5.87415466e+01 -2.08511108e+02]\n", + " ...\n", + " [-1.13411917e+14 -3.48418640e+02 1.52205795e+02 ... 
1.51519928e+02\n", + " 2.45651031e+02 2.52972198e+02]\n", + " [-3.75985275e+12 2.39696625e+02 1.51989685e+02 ... -2.85605354e+01\n", + " -1.79121232e+00 -8.17222748e+01]\n", + " [ 1.11016038e+14 -1.96372967e+01 -1.27668396e+02 ... 3.35008011e+01\n", + " -7.46116943e+01 -7.00147247e+01]]\n", + "FF:[[ 7.28854608e+13 6.37500977e+02 2.96775513e+02 ... 8.35403976e+01\n", + " 1.72460068e+02 2.90483646e+01]\n", + " [-6.08357832e+13 -5.23225098e+01 -2.34542755e+02 ... -1.87501526e+01\n", + " -8.99431992e+01 8.64022217e+01]\n", + " [-7.97326201e+13 -4.24736572e+02 -1.82207733e+02 ... 3.21793270e+00\n", + " -5.87416573e+01 -2.08511139e+02]\n", + " ...\n", + " [-1.13411925e+14 -3.48418640e+02 1.52205902e+02 ... 1.51519714e+02\n", + " 2.45650864e+02 2.52972260e+02]\n", + " [-3.75988630e+12 2.39696686e+02 1.51989319e+02 ... -2.85606136e+01\n", + " -1.79138493e+00 -8.17222137e+01]\n", + " [ 1.11016046e+14 -1.96372318e+01 -1.27668480e+02 ... 3.35009079e+01\n", + " -7.46116791e+01 -7.00146637e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... -1.1341192e+14\n", + " -3.7598527e+12 1.1101604e+14]\n", + " [ 3.3241980e+03 -6.3044128e+02 -3.0447307e+03 ... 3.0137921e+02\n", + " 3.8262988e+02 -4.2889914e+02]\n", + " [ 3.5639046e+01 -1.6155790e+01 -2.4461178e+01 ... 2.7450909e+02\n", + " 1.6181946e+02 -2.5407137e+02]\n", + " ...\n", + " [ 4.6487908e+00 -9.6633381e-01 -2.7078497e-01 ... 3.6374569e+01\n", + " -1.7563061e+00 -7.1206141e+00]\n", + " [ 1.8901447e+00 8.9006472e-01 -4.3125896e+00 ... 2.6014965e+01\n", + " -3.7720141e-01 -7.8855257e+00]\n", + " [ 1.9513500e+00 5.8041654e+00 -1.4006979e+01 ... 7.2743622e+01\n", + " -2.3499712e+01 -2.0133139e+01]]\n", + "FF:[[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... -1.13411925e+14\n", + " -3.75988630e+12 1.11016046e+14]\n", + " [ 3.32419922e+03 -6.30442505e+02 -3.04472998e+03 ... 3.01379364e+02\n", + " 3.82629669e+02 -4.28898712e+02]\n", + " [ 3.56390572e+01 -1.61558037e+01 -2.44611683e+01 ... 2.74509308e+02\n", + " 1.61819229e+02 -2.54071594e+02]\n", + " ...\n", + " [ 4.64879847e+00 -9.66338813e-01 -2.70792574e-01 ... 3.63745117e+01\n", + " -1.75632846e+00 -7.12060070e+00]\n", + " [ 1.89013767e+00 8.90062451e-01 -4.31257772e+00 ... 2.60149212e+01\n", + " -3.77217919e-01 -7.88551569e+00]\n", + " [ 1.95135939e+00 5.80417490e+00 -1.40069904e+01 ... 7.27435226e+01\n", + " -2.34996586e+01 -2.01330910e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.609953703703703% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-1.17282076e+14 -2.12461621e+03 8.80099030e+01 ... 4.34470520e+01\n", + " 7.55885468e+01 -2.88791332e+01]\n", + " [-2.07757936e+14 -3.81796265e+02 -2.33774780e+02 ... 
8.11984329e+01\n", + " -4.41825638e+01 7.35064125e+00]\n", + " [ 4.11484165e+13 2.50572113e+02 1.91601822e+02 ... 1.00269365e+01\n", + " -3.41638985e+01 1.20433075e+02]\n", + " ...\n", + " [ 7.95562329e+13 1.55007373e+03 1.70351212e+02 ... -1.80320053e+01\n", + " 8.77533417e+01 2.14678173e+01]\n", + " [-1.86546485e+14 -5.18847070e+03 -3.34331085e+02 ... 2.51586838e+01\n", + " -4.06135368e+01 -6.27860641e+00]\n", + " [ 1.89751705e+14 -3.09853809e+03 -1.18278351e+01 ... -1.24640663e+02\n", + " 1.59719009e+01 -6.47173615e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-1.17282034e+14 -2.12461694e+03 8.80101547e+01 ... 4.34468918e+01\n", + " 7.55886002e+01 -2.88791542e+01]\n", + " [-2.07757920e+14 -3.81795776e+02 -2.33774765e+02 ... 8.11985397e+01\n", + " -4.41825829e+01 7.35066986e+00]\n", + " [ 4.11484543e+13 2.50570099e+02 1.91601196e+02 ... 1.00270777e+01\n", + " -3.41638451e+01 1.20433121e+02]\n", + " ...\n", + " [ 7.95562413e+13 1.55007288e+03 1.70350784e+02 ... -1.80321960e+01\n", + " 8.77533112e+01 2.14678249e+01]\n", + " [-1.86546469e+14 -5.18847070e+03 -3.34331268e+02 ... 2.51588135e+01\n", + " -4.06132622e+01 -6.27861023e+00]\n", + " [ 1.89751521e+14 -3.09853711e+03 -1.18275299e+01 ... -1.24640862e+02\n", + " 1.59719791e+01 -6.47173767e+01]]\n", + "7.530381944444445% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181\n", + " 226 261 284 318 320 378 382 385 391 395 403 422\n", + " 434 495 515 523 524 549 579 610 644 710 764 772\n", + " 870 984 987 1045 1249 1330 1362 1489 1517 1550 1556 1588\n", + " 1595 1659 1672 1684 1689 1768 1792 1799 1808 1818 1842 1871\n", + " 1889 1899 1910 1915 1925 1936 1993 1997 2033 2041 2059 2062\n", + " 2066 2098 2111 2124 2129 2130 2146 2153 2159 2166 2197 2206\n", + " 2210 2212 2222 2234 2237 2320 2321 2357 2359 2362 2385 2428\n", + " 2518 2539 2553 2568 2598 2683 2689 2694 2711 2714 2733 2787\n", + " 2788 2795 2811 2815 2853 2881 2890 2917 2981 2997 3021 3037\n", + " 3089 3149 3163 3191 3196 3217 3225 3248 3277 3287 3292 3305\n", + " 3327 3361 3385 3402 3417 3425 3456 3479 3516 3521 3528 3555\n", + " 3587 3599 3608 3684 3702 3733 3770 3779 3819 3822 3823 3898\n", + " 3921 3942 3950 4012 4053 4077 4086 4091 4139 4185 4198 4225\n", + " 4241 4296 4347 4349 4368 4403 4407 4418 4453 4471 4472 4473\n", + " 4494 4537 4549 4555 4558 4598 4623 4648 4666 4698 4729 4782\n", + " 4848 4866 4886 4943 4959 5008 5010 5012 5057 5079 5177 5178\n", + " 5186 5211 5271 5281 5296 5313 5328 5356 5364 5409 5429 5440\n", + " 5453 5455 5457 5476 5529 5563 5591 5621 5625 5631 5654 5661\n", + " 5692 5705 5720 5740 5751 5758 5787 5799 5813 5835 5836 5867\n", + " 5872 5893 5953 5974 5980 5982 6000 6055 6082 6086 6102 6107\n", + " 6123 6159 6172 6193 6220 6230 6231 6263 6286 6297 6362 6396\n", + " 6401 6430 6436 6485 6497 6499 6502 6510 6537 6554 6555 6563\n", + " 6564 6579 6586 6598 6615 6625 6626 6649 6651 6661 6754 6764\n", + " 6776 6852 6863 6874 6883 6892 6913 6945 6969 7036 7057 7066\n", + " 7082 7138 7147 7150 7157 7197 7202 7231 7234 7235 7240 7270\n", + " 7278 7287 7322 7327 7345 7348 7361 7390 7402 7490 7539 7573\n", + " 7610 7714 7721 7758 7794 7812 7827 7829 7837 7839 7882 7894\n", + " 7943 7948 7952 7969 7975 7996 8024 8027 8037 8043 8055 8078\n", + " 8079 8088 8090 8095 8154 8258 8264 8283 8297 8313 8329 8336\n", + " 8359 8361 8376 8383 8416 8421 8428 8454 8475 8502 8521 8613\n", + " 8642 8653 8696 8756 8764 8777 8791 8837 8849 8859 8878 8955\n", + " 8991 8997 9006 9012 9040 9066 9093 9097 9098 9131 9158 9162\n", + " 9165 9214 9216 9280 9297 9301 9316 9355 9371 9412 9421 9475\n", + " 9510 9580 9620 9645 9696 9713 9732 9768 9802 9817 9819 9826\n", + " 9839 9846 9947 10004 10062 10065 10072 10103 10107 10108 10138 10167\n", + " 10173 10228 10262 10292 10326 10356 10360 10372 10421 10446 10466 10468\n", + " 10499 10505 10513 10517 10589 10606 10612 10645 10664 10669 10726 10777\n", + " 10835 10838 10839 10848 10855 10877 10897 10941 10963 10971 10977 10997\n", + " 11030 11060 11065 11076 11088 11140 11167 11174 11231 11252 11257 11259\n", + " 11275 11297 11302 11319 11331 11333 11357 11358 11380 11382 11402 11423\n", + " 11446 11447 11500 11501 11522 11585 11623 11670 11728 11736 11759 11761\n", + " 11772 11785 11839 11894 11916 11924 11936 11962 11968 11969 11977 11984\n", + " 12008 12030 12054 12074 12123 12175 12182 12194 12237 12262 12282 12285\n", + " 12341 12348 12351 12370 12376 12386 12399 12449 12507 12513 12518 12522\n", + " 12549 12572 12643 12648 12663 12689 12696 12710 12769 12780 12788 12792\n", + " 12793 12852 12864 12879 12884 12985 13018 13041 13057 13176 13264 13272\n", + " 13274 13275 13292 13303 13333 13379 13427 13428 13442 13451 13454 13500\n", + " 13510 13533 13564 13588 13607 13640 13655 13686 13687 13688 13732 13747\n", + " 13786 13801 13803 13826 13841 13846 13850 13892 13909 13946 14036 14040\n", + 
" 14046 14060 14080 14152 14161 14183 14195 14210 14240 14278 14331 14354\n", + " 14370 14372 14386 14395 14409 14432 14434 14497 14506 14531 14559 14589\n", + " 14648 14663 14686 14698 14715 14743 14757 14799 14808 14810 14849 14893\n", + " 14902 14929 14937 14947 14953 14958 15005 15012 15018 15036 15066 15069\n", + " 15083 15152 15154 15196 15197 15212 15292 15309 15323 15340 15343 15375\n", + " 15389 15396 15408 15410 15454 15499 15532 15557 15605 15647 15677 15736\n", + " 15745 15756 15769 15809 15824 15876 15882 15900 15906 15941 16027 16030\n", + " 16040 16116 16190 16192 16205 16207 16239 16279 16285 16295 16348 16358\n", + " 16367 16384 16386 16394 16399 16455 16457 16458 16471 16495 16500 16502\n", + " 16520 16541 16542 16598 16623 16643 16651 16665 16673 16679 16713 16725\n", + " 16734 16736 16739 16751 16756 16768 16861 16870 16939 16976 17007 17028\n", + " 17040 17069 17087 17108 17125 17139 17151 17158 17174 17175 17178 17182\n", + " 17189 17221 17258 17341 17360 17370 17381 17395 17396 17415 17432 17450\n", + " 17463 17470 17472 17473 17496 17507 17536 17608 17626 17627 17649 17653\n", + " 17664 17771 17815 17822 17831 17864 17883 17931 17994 17999 18035 18174\n", + " 18209 18250 18274 18307 18327 18403 18423]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 6.5550952e+14 4.9376585e+14 3.8510841e+14 ... 1.6802770e+00\n", + " -1.1248941e+00 -1.1701980e+00]\n", + "FF:[ 6.55509317e+14 4.93765882e+14 3.85108377e+14 ... 1.68027747e+00\n", + " -1.12489426e+00 -1.17019880e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 6 79 111 149 155 168 187 195 220 223 252 261 329 343\n", + " 347 369 386 392 403 438 439 450 461 524 535 643 656 659\n", + " 661 668 722 727 732 742 754 801 816 820 835 837 849 850\n", + " 978 993 997 1012 1019 1034 1044 1071 1088 1094 1114 1135 1151 1170\n", + " 1190 1212 1273 1275 1277 1289 1290 1308 1311 1337 1364 1379 1394 1430\n", + " 1454 1460 1469 1474 1703 1725 1728 1732 1733 1741 1754 1757 1804 1806\n", + " 1856 1862 1932 1945 1996 2030 2044 2045 2065 2071 2075 2094 2149 2152\n", + " 2163 2180 2182 2215 2254 2357 2362 2370 2392 2398 2428 2484 2519 2521\n", + " 2524 2582 2618 2641 2645 2664 2674 2681 2691 2735 2747 2779 2872 2899\n", + " 2909 2935 2957 3000 3033]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_input_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 3.5121140e+01\n", + " -3.5587997e+00 9.5641022e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 3.51211472e+01\n", + " -3.55898285e+00 9.56410980e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_o_proj_in_grad\n", + "HF: [-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... -2.5844165e+02\n", + " 2.0677340e+01 -2.4573349e+01]\n", + "FF:[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... -2.58441467e+02\n", + " 2.06775093e+01 -2.45735531e+01]\n", + "[ True True True ... True True True]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -2.1968115e+02 8.5754425e+01 ... -6.9909119e+01\n", + " -2.6478451e+01 -7.4195160e+01]\n", + " [-3.5698813e+17 3.9582391e+02 5.5431940e+02 ... 1.9529277e+02\n", + " 1.2558211e+02 6.7965935e+01]\n", + " [ 3.4442975e+16 2.8310864e+02 -8.1522171e+01 ... -2.3606525e+01\n", + " -2.0410315e+01 -1.5228156e+02]\n", + " ...\n", + " [ 4.0923264e+16 -2.4507169e+02 -8.2614380e+02 ... -2.6583340e+02\n", + " -1.9878247e+02 -2.5844165e+02]\n", + " [ 6.9156258e+17 1.3969666e+02 -7.5639044e+02 ... -1.5231053e+02\n", + " -3.3650037e+02 2.0677340e+01]\n", + " [ 9.9511712e+16 -3.2348724e+01 3.0624988e+02 ... 1.0391423e+02\n", + " 6.0626881e+01 -2.4573349e+01]]\n", + "FF:[[-1.61869621e+17 -2.19681122e+02 8.57541504e+01 ... -6.99092026e+01\n", + " -2.64783611e+01 -7.41952515e+01]\n", + " [-3.56988336e+17 3.95823853e+02 5.54319275e+02 ... 1.95292725e+02\n", + " 1.25582062e+02 6.79659348e+01]\n", + " [ 3.44430865e+16 2.83108551e+02 -8.15224686e+01 ... -2.36064014e+01\n", + " -2.04101429e+01 -1.52281570e+02]\n", + " ...\n", + " [ 4.09233933e+16 -2.45071564e+02 -8.26143555e+02 ... -2.65833405e+02\n", + " -1.98782272e+02 -2.58441467e+02]\n", + " [ 6.91562577e+17 1.39696579e+02 -7.56390808e+02 ... -1.52310455e+02\n", + " -3.36500092e+02 2.06775093e+01]\n", + " [ 9.95114373e+16 -3.23486938e+01 3.06250122e+02 ... 1.03914482e+02\n", + " 6.06264191e+01 -2.45735531e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... 4.0923264e+16\n", + " 6.9156258e+17 9.9511712e+16]\n", + " [-5.3483575e+02 2.6249797e+03 -6.7268573e+02 ... -6.1204077e+03\n", + " -4.3047915e+03 -9.5139771e+01]\n", + " [-1.2200641e+01 1.0347147e+02 -2.6777636e+01 ... -1.4766699e+02\n", + " -9.8514114e+01 1.2616925e+01]\n", + " ...\n", + " [-3.2097631e+00 9.1431990e+00 -1.6333975e+00 ... -6.9996667e+00\n", + " -6.4008064e+00 1.9126304e+00]\n", + " [-3.0982289e+00 1.2355285e+01 -3.1715555e+00 ... -4.6754313e+00\n", + " -6.2553053e+00 1.0515085e+00]\n", + " [-2.9516125e+00 2.7038031e+00 -6.0580249e+00 ... -1.6555168e+01\n", + " 1.3245420e+00 -1.5741113e+00]]\n", + "FF:[[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... 
4.09233933e+16\n", + " 6.91562577e+17 9.95114373e+16]\n", + " [-5.34834961e+02 2.62497900e+03 -6.72686401e+02 ... -6.12040576e+03\n", + " -4.30479297e+03 -9.51402283e+01]\n", + " [-1.22006664e+01 1.03471611e+02 -2.67777309e+01 ... -1.47666946e+02\n", + " -9.85141525e+01 1.26169167e+01]\n", + " ...\n", + " [-3.20977211e+00 9.14321709e+00 -1.63339353e+00 ... -6.99966621e+00\n", + " -6.40081263e+00 1.91262615e+00]\n", + " [-3.09821057e+00 1.23552399e+01 -3.17152786e+00 ... -4.67541933e+00\n", + " -6.25528765e+00 1.05149710e+00]\n", + " [-2.95161533e+00 2.70380235e+00 -6.05802393e+00 ... -1.65551491e+01\n", + " 1.32455230e+00 -1.57412362e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "8.101851851851851% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.3778828e+16 1.0956941e+03 1.1773144e+02 ... -4.0466427e+01\n", + " -3.1198654e+01 -1.7603550e+01]\n", + " [-1.2087128e+18 6.9384756e+03 6.1327003e+01 ... 1.5329468e+01\n", + " 7.6757736e+00 -4.5589094e+00]\n", + " [-6.7892266e+17 5.4895034e+03 7.6927376e+01 ... 9.1396770e+00\n", + " 2.3195824e+01 -6.1995559e+00]\n", + " ...\n", + " [ 2.6452032e+17 9.9761787e+03 2.2349066e+02 ... 5.7504387e+01\n", + " -8.6791611e-01 4.6890911e+01]\n", + " [-6.7528534e+16 3.3856902e+03 2.5189743e+02 ... 2.2824722e+01\n", + " 8.7917282e+01 -2.1569672e+01]\n", + " [-2.1779064e+17 5.2511855e+03 6.6282043e+01 ... 9.9689598e+00\n", + " -5.5022659e+00 -3.2573143e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.37791458e+16 1.09569678e+03 1.17731285e+02 ... -4.04664154e+01\n", + " -3.11988506e+01 -1.76035423e+01]\n", + " [-1.20871251e+18 6.93847900e+03 6.13275528e+01 ... 1.53295393e+01\n", + " 7.67594433e+00 -4.55900288e+00]\n", + " [-6.78922523e+17 5.48950342e+03 7.69272308e+01 ... 9.13961220e+00\n", + " 2.31957569e+01 -6.19959354e+00]\n", + " ...\n", + " [ 2.64520284e+17 9.97617871e+03 2.23490509e+02 ... 5.75044785e+01\n", + " -8.67943764e-01 4.68908234e+01]\n", + " [-6.75287400e+16 3.38569165e+03 2.51897339e+02 ... 2.28247147e+01\n", + " 8.79171448e+01 -2.15696106e+01]\n", + " [-2.17790679e+17 5.25118652e+03 6.62821960e+01 ... 9.96885872e+00\n", + " -5.50213098e+00 -3.25731125e+01]]\n", + "9.809027777777777% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.7.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_7_layers_7_feed_forward_w2_shard_0_output_0\n", + "HF: [-7.5522525e+19 -1.3283726e+21 -7.2549753e+20 ... 4.9017162e+01\n", + " -9.7436657e+00 8.5870697e+01]\n", + "FF:[-7.55228501e+19 -1.32837218e+21 -7.25497390e+20 ... 4.90171394e+01\n", + " -9.74382782e+00 8.58707886e+01]\n", + "[ True True True ... True False True]\n", + "[ 19 64 75 ... 
18418 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 95\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- W2 --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 95\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m compare_tensors(hf_w2_weight, ff_w2_weight, tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- Lora --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:47\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "attention_tests=True\n", + "for i in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0\"\n", + " 
hf_BWD_w2_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_path}/layers.{i}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_path}/layers.{i}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_path}/layers.{i}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_path}/layers.{i}.self_attn.o_proj.weight\"\n", + " \n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = 
f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_0\"\n", + " ff_BWD_attn_out = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w1_shard_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w3_shard_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_weight_0\"\n", + " \n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if i == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + "\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " if i == tot_num_layers-1:\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " \n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " print(\"-- W2 --\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " \n", + " print(\"-- Lora --\")\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + " \n", + " print(\"-- W2/W1/W3 --\")\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " \n", + " print(\"-- Attention 
--\")\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " if attention_tests:\n", + " # compare attn weight tensors\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.qk_prods_softmax.output_0\"\n", + " ff_attn_qk_prods_softmax = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " # NEED TO VISUALLY INSPECT\n", + " compare_loaded_tensors(hf_attn_heads_grads, ff_attn_heads_grads)\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads)\n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.value_states.output_0\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.qk_prods_softmax.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " \n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", 
+ " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " # print(hf_qk_prods_softmax2[:2,:,0])\n", + " # print(ff_qk_prods_softmax2[:2,:,0])\n", + " assert(pct_mismatch <= 0.1)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.pre_softmax.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " compare_loaded_tensors(hf_qk_prods_softmax2, ff_qk_prods_softmax2)\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = hf_path + f\"/fwd_step_0_layers.11.self_attn.query_activation.output_0\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " # print(hf_query_activation[:,0,:])\n", + " # print()\n", + " # print(ff_query_activation[:,0,:])\n", + " # assert False\n", + " # compare_loaded_tensors(hf_query_activation, ff_query_activation)\n", + " check_rope = False\n", + " if check_rope:\n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], 
device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) 
##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.1)\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.01614726 0.01363804 0.01768043 ... 
0.00724926 -0.00149747\n", + " -0.01781223]\n" + ] + } + ], + "source": [ + "a = np.fromfile(\"/usr0/home/goliaro/.cache/flexflow/weights/goliaro/llama-160m-lora-full/full-precision/layers_11_feed_forward_w2_lora_A_weight\", dtype=np.float32)\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + 
"print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_path}/layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_path}/layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment/opt_alignment_tests.ipynb b/tests/peft/alignment/opt_alignment_tests.ipynb new file mode 100644 index 0000000000..ca679b1857 --- /dev/null +++ b/tests/peft/alignment/opt_alignment_tests.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- LM head ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- Final Norm ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "qProjSize = 64\n", + "num_heads = 12\n", + "num_tokens = 25\n", + "for i in range(tot_num_layers):\n", + " hf_base = os.path.join(hf_path, f\"fwd_step_0_decoder.layers.{i}.\")\n", + " ff_base = os.path.join(ff_path, f\"fwd_step_0_layers_{i}_layers_{i}_\")\n", + " \n", + " # LayerNorm\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.input_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.output_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_1\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "\n", + " # # Attention QKV proj\n", + " # print(\"---Attn---\")\n", + " # ff_tensor = ff_base + \"attention_shard_0_qkv_proj_output\"\n", + " # ff_tensor = load_ff_tensor(ff_tensor, [qProjSize, num_heads, 3, num_tokens])\n", + " # ff_q_proj = ff_tensor[:,:,0,:]\n", + " # ff_k_proj = ff_tensor[:,:,1,:]\n", + " # ff_v_proj = ff_tensor[:,:,2,:]\n", + " # hf_q_proj = hf_base + \"self_attn.q_proj.output_0\"\n", + " # hf_q_proj = load_hf_tensor(hf_q_proj).squeeze().T\n", + " # hf_q_proj = hf_q_proj.reshape(12,64,25)\n", + " # hf_q_proj = np.transpose(hf_q_proj, (1,0,2))\n", + " # hf_k_proj = hf_base + \"self_attn.k_proj.output_0\"\n", + " # hf_k_proj = load_hf_tensor(hf_k_proj).squeeze().T\n", + " # hf_k_proj = hf_k_proj.reshape(12,64,25)\n", + " # hf_k_proj = np.transpose(hf_k_proj, (1,0,2))\n", + " # hf_v_proj = hf_base + \"self_attn.v_proj.output_0\"\n", + " # hf_v_proj = load_hf_tensor(hf_v_proj).squeeze().T\n", + " # hf_v_proj = hf_v_proj.reshape(12,64,25)\n", + " # hf_v_proj = np.transpose(hf_v_proj, (1,0,2))\n", + " # compare_loaded_tensors(hf_q_proj/np.sqrt(qProjSize), ff_q_proj)\n", + " # compare_loaded_tensors(hf_k_proj, ff_k_proj)\n", + " # compare_loaded_tensors(hf_v_proj, ff_v_proj)\n", + "\n", + " # Compare attn bias, residuals\n", + " print(\"--- Attn bias + residual ---\")\n", + " ff_residual1 = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_input_1\"\n", + " ff_residual2 = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_flexflow_tensors(ff_residual1, ff_residual2)\n", + " hf_tensor = hf_base + 
\"self_attn_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_residual2)\n", + " ff_tensor = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_output_0\"\n", + " hf_tensor = hf_base + \"final_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " \n", + " print(\"--- MLP ---\")\n", + " hf_tensor = hf_base + \"fc1.input_0\"\n", + " ff_tensor = ff_base + \"fc1_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"fc2.input_0\"\n", + " ff_tensor = ff_base + \"fc2_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "# LM head\n", + "print(\"\\n--- LM head ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "# Final layer norm\n", + "print(\"\\n--- Final Norm ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "ff_tensor1 = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_input_activation\"\n", + "# compare_flexflow_tensors_shortest(ff_tensor, ff_tensor1)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_1\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_1\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_mean\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_2\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_rstd\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m compare_flexflow_tensors(ff_tensor, ff_tensor1)\n\u001b[1;32m 20\u001b[0m compare_tensors(hf_tensor, ff_tensor) \u001b[38;5;66;03m# fails\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Compare fwd input/output of layernorm\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_decoder.final_layer_norm.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "# Compare backward pass\n", + "hf_tensor = hf_path 
+ \"/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "hf_tensor = hf_path + \"/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "\n", + "hf_tensor1 = hf_path + \"/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "compare_hf_tensors(hf_tensor, hf_tensor1)\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor1, ff_tensor)\n", + "\n", + "hf_tensor = hf_path + \"/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_0\"\n", + "ff_tensor1 = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_1\"\n", + "compare_flexflow_tensors(ff_tensor, ff_tensor1)\n", + "compare_tensors(hf_tensor, ff_tensor) # fails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\n", + "HF: [ 0.0193019 -1.0467215 0.21579844 ... 0.04534929 -0.25642633\n", + " 0.10879952]\n", + "FF:[ 0.01458706 -1.02212262 0.20589906 ... 0.04446212 -0.25625792\n", + " 0.108039 ]\n", + "[ True False True ... True True True]\n", + "[ 1 3 7 ... 19170 19174 19188]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m hf_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m ff_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_fc1_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_fc1_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# LORA input\u001b[39;00m\n\u001b[1;32m 20\u001b[0m hf_lora_A_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:32\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 
27\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.self_attn_layer_norm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_layer_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " \n", + " hf_ffn_norm_in = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.input_0\"\n", + " ff_ffn_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_0\"\n", + " # compare_tensors(hf_ffn_norm_in, ff_ffn_norm_in)\n", + " \n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_1\"\n", + " # compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " hf_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\"\n", + " ff_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\"\n", + " compare_tensors(hf_fc1_in, ff_fc1_in)\n", + "\n", + "\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = 
f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.final_layer_norm.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\n", + "HF: [-0.00542103 -1.781267 0.16552497 ... -0.77217525 -0.5760026\n", + " 0.04363118]\n", + "FF:[ 0.03817766 -1.5644939 0.22477378 ... -0.94569921 -0.43960798\n", + " -0.06447437]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 
19197 19198 19199]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m ff_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 21\u001b[0m ff_FWD_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_FWD_norm_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_FWD_norm_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n\u001b[1;32m 25\u001b[0m hf_BWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_decoder.final_layer_norm.gi_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:29\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, 
tolerance=1e-5)\n", + "\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "\n", + "# Compare fwd input/output of layernorm\n", + "hf_FWD_norm_in = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "hf_FWD_norm_out = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_FWD_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "ff_FWD_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_FWD_norm_in, ff_FWD_norm_in)\n", + "compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n", + "\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb deleted file mode 100644 index e2a8978ea3..0000000000 --- a/tests/peft/alignment_tests.ipynb +++ /dev/null @@ -1,1427 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os, torch" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "hf_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors\"\n", - "ff_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors\"\n", - "def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):\n", - " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", - " hf_tensor = torch.load(hf_tensor_filepath)\n", - " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", - " assert(len(hf_tensor) == 1)\n", - " hf_tensor = hf_tensor[0]\n", - " hf_tensor = torch.nan_to_num(hf_tensor)\n", - " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", - " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", - "\n", - " len_hf_tensor = hf_tensor.shape[0]\n", - " ff_tensor = ff_tensor[:len_hf_tensor]\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", - " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}\")\n", - " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", - " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #print(np.nonzero(hf_tensor)[0])\n", - " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", - " # print(ff_tensor[36], hf_tensor[36])\n", - " 
#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len_hf_tensor)\n", - " print(\"Ok!\")\n", - "def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):\n", - " assert(os.path.exists(hf_tensor_filepath))\n", - " assert(os.path.exists(ff_tensor1_filepath))\n", - " assert(os.path.exists(ff_tensor2_filepath))\n", - " hf_tensor = torch.load(hf_tensor_filepath)\n", - " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", - " assert(len(hf_tensor) == 1)\n", - " hf_tensor = hf_tensor[0]\n", - " hf_tensor = torch.nan_to_num(hf_tensor)\n", - " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',')\n", - "\n", - " len_hf_tensor = hf_tensor.shape[0]\n", - " ff_tensor1 = ff_tensor1[:len_hf_tensor]\n", - " ff_tensor2 = ff_tensor2[:len_hf_tensor]\n", - " ff_tensor = ff_tensor1 - ff_tensor2\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", - " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}\")\n", - " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", - " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #print(np.nonzero(hf_tensor)[0])\n", - " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", - " # print(ff_tensor[36], hf_tensor[36])\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len_hf_tensor)\n", - " print(\"Ok!\")\n", - "def compare_hf_tensors(tensor1_fp, tensor2_fp):\n", - " assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", - " hf_tensor1 = torch.load(tensor1_fp)\n", - " hf_tensor2 = torch.load(tensor2_fp)\n", - " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", - " assert(len(hf_tensor1) == 1)\n", - " hf_tensor1 = hf_tensor1[0]\n", - " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", - " assert(len(hf_tensor2) == 1)\n", - " hf_tensor2 = hf_tensor2[0]\n", - " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", - " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", - " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", - " if not (np.allclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())):\n", - " print(f\"mismatch between {tensor1_fp} and {tensor2_fp}\")\n", - " print(hf_tensor1)\n", - " print(hf_tensor2)\n", - " print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))\n", - " mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0]\n", - " print(mismatches)\n", - " assert(False)\n", - " print(\"Ok!\")\n", - "\n", - "def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):\n", - " assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", - " hf_tensor_sum = torch.load(tensor_sum_fp)\n", - " hf_tensor1 = torch.load(tensor1_fp)\n", - " hf_tensor2 = torch.load(tensor2_fp)\n", - " if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:\n", - " assert(len(hf_tensor_sum) == 1)\n", - " hf_tensor_sum = hf_tensor_sum[0]\n", - " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", - " assert(len(hf_tensor1) == 1)\n", - " hf_tensor1 
= hf_tensor1[0]\n", - " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", - " assert(len(hf_tensor2) == 1)\n", - " hf_tensor2 = hf_tensor2[0]\n", - " assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape)\n", - " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", - " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", - " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", - " hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)\n", - " sum_check_tensor = hf_tensor1 + hf_tensor2\n", - " if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())):\n", - " print(f\"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}\")\n", - " print(tensor_sum_fp)\n", - " print(sum_check_tensor)\n", - " print(hf_tensor1)\n", - " print(hf_tensor2)\n", - " print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))\n", - " mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0]\n", - " print(mismatches)\n", - " assert(False)\n", - " print(\"Ok!\")\n", - "def check_hf_zero_tensor(hf_tensor_fp):\n", - " assert(os.path.exists(hf_tensor_fp))\n", - " hf_tensor1 = torch.load(hf_tensor_fp)\n", - " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", - " assert(len(hf_tensor1) == 1)\n", - " hf_tensor1 = hf_tensor1[0]\n", - " assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)\n", - "def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=\"\"):\n", - " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", - " hf_tensor = torch.load(hf_tensor_filepath)\n", - " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", - " assert(len(hf_tensor) == 1)\n", - " hf_tensor = hf_tensor[0]\n", - " hf_tensor = torch.nan_to_num(hf_tensor)\n", - " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", - " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", - "\n", - " len_hf_tensor = hf_tensor.shape[0]\n", - " ff_tensor = ff_tensor[:len_hf_tensor]\n", - "\n", - " print(f\"{txt} - HF tensor:\")\n", - " print(hf_tensor)\n", - " print(f\"{txt} - FF tensor: \")\n", - " print(ff_tensor)\n", - "def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):\n", - " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", - "\n", - " if (ff_tensor1.shape != ff_tensor2.shape):\n", - " print(ff_tensor1.shape, ff_tensor2.shape)\n", - " assert(ff_tensor1.shape == ff_tensor2.shape)\n", - "\n", - " if max_len > -1:\n", - " ff_tensor1 = ff_tensor1[:max_len]\n", - " ff_tensor2 = ff_tensor2[:max_len]\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", - " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", - " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", - " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len(ff_tensor1))\n", - " print(\"Ok!\")\n", - "def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", - " assert(os.path.exists(ff_tensor1_fp) and 
os.path.exists(ff_tensor2_fp))\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", - " minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])\n", - " ff_tensor1 = ff_tensor1[:minlen]\n", - " ff_tensor2 = ff_tensor2[:minlen]\n", - " mismatches = []\n", - " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", - " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", - " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", - " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len(ff_tensor1))\n", - " print(\"Ok!\")\n", - "def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", - " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", - " ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')\n", - " \n", - " ff_sum = ff_tensor1 + ff_tensor2\n", - " assert(ff_tensor1.shape == ff_tensor2.shape)\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):\n", - " print(f\"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}\")\n", - " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", - " print(f\"Sum Tensor: {ff_tensor_sum}\\nActual sum:{ff_sum}\")\n", - " print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len(ff_tensor1))\n", - " print(\"Ok!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - 
"Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" - ] - } - ], - "source": [ - "tot_num_layers = 12\n", - "for layer_num in range(tot_num_layers):\n", - " hf_input_ln_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.input_layernorm.output_0\"\n", - " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_RMSNorm_shard-id_0_output_0\"\n", - " if layer_num > 0:\n", - " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_norm_shard-id_0_output_1\"\n", - " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", - " hf_attn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.o_proj.output_0\"\n", - " ff_attn_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", - " compare_tensors(hf_attn_out, ff_attn_out)\n", - " hf_ffn_norm_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.post_attention_layernorm.output_0\"\n", - " ff_ffn_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_1\"\n", - " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", - " # w1\n", - " hf_gate_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", - " ff_gate_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", - " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", - " # w3\n", - " hf_up_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\" \n", - " ff_up_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", - " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", - " # w2\n", - " hf_down_proj_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.input_0\"\n", - " hf_down_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.output_0\"\n", - " ff_down_proj_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", - " ff_down_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", - " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", - " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", - " # LORA input\n", - " hf_lora_A_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", - " ff_lora_A_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", - " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", - " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", - " # LORA weights\n", - " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", - " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", - " 
compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", - " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", - " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", - " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", - " # LORA intermediate hf\n", - " hf_lora_A_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", - " hf_lora_B_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", - " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", - " # LORA output\n", - " hf_lora_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", - " ff_lora_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", - " # compare_tensors(hf_lora_out, ff_lora_out)\n", - " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", - " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", - " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", - " \n", - "\n", - "# After last layer only\n", - "hf_norm_out = f\"{hf_weight_base_path}/fwd_step_0_norm.output_0\"\n", - "ff_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", - "compare_tensors(hf_norm_out, ff_norm_out)\n", - "hf_lm_head_out = f\"{hf_weight_base_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", - "ff_lm_head_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", - "compare_tensors(hf_lm_head_out, ff_lm_head_out)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" - ] - } - ], - "source": [ - "tot_num_layers = 12\n", - "\n", - "ff_BWD_softmax_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", - "\n", - "hf_BWD_lm_head_out = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", - "ff_BWD_lm_head_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", - "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", - "# compare weights\n", - "hf_lm_head_weight = f\"{hf_weight_base_path}/base_model.model.lm_head.weight\"\n", - "ff_lm_head_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_weight_0\"\n", - "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", - "hf_BWD_lm_head_in = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", - "ff_BWD_lm_head_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_input_0\"\n", - "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", - "# # Manually check the matmul\n", - "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", - "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", - "# ff_tensor_out = 
ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", - "# print(ff_tensor_out.shape)\n", - "# print(ff_weight.shape)\n", - "# print(np.matmul(ff_weight, ff_tensor_out))\n", - "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", - "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", - "\n", - "hf_BWD_norm_out = f\"{hf_weight_base_path}/bwd_step_0_norm.go_0\"\n", - "ff_BWD_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_0\"\n", - "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", - "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", - "ff_BWD_norm_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_weight_0\"\n", - "hf_FWD_norm_weight = f\"{hf_weight_base_path}/base_model.model.model.norm.weight\"\n", - "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", - "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", - "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", - "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from torch import nn\n", - "class LlamaRotaryEmbedding(nn.Module):\n", - " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", - " super().__init__()\n", - "\n", - " self.dim = dim\n", - " self.max_position_embeddings = max_position_embeddings\n", - " self.base = base\n", - " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", - " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", - "\n", - " # Build here to make `torch.jit.trace` work.\n", - " self._set_cos_sin_cache(\n", - " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", - " )\n", - "\n", - " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", - " self.max_seq_len_cached = seq_len\n", - " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", - "\n", - " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", - " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", - " emb = torch.cat((freqs, freqs), dim=-1)\n", - " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", - " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", - "\n", - " def forward(self, x, seq_len=None):\n", - " # x: [bs, num_attention_heads, seq_len, head_size]\n", - " if seq_len > self.max_seq_len_cached:\n", - " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", - "\n", - " return (\n", - " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", - " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", - " )\n", - "def rotate_half(x):\n", - " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", - " x1 = x[..., : x.shape[-1] // 2] # first half\n", - " x2 = x[..., x.shape[-1] // 2 :] # second half\n", - " return torch.cat((x2, -x1), dim=-1)\n", - "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", - " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", - "\n", - " Args:\n", - " q (`torch.Tensor`): The query tensor.\n", - " k (`torch.Tensor`): The key tensor.\n", - " cos (`torch.Tensor`): The cosine 
part of the rotary embedding.\n", - " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", - " position_ids (`torch.Tensor`):\n", - " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", - " used to pass offsetted position ids when working with a KV-cache.\n", - " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", - " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", - " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", - " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", - " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", - " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", - " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", - " Returns:\n", - " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", - " \"\"\"\n", - " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", - " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", - " q_embed = (q * cos) + (rotate_half(q) * sin)\n", - " k_embed = (k * cos) + (rotate_half(k) * sin)\n", - " return q_embed, k_embed\n", - "head_dim = 64\n", - "max_position_embeddings = 2048\n", - "rope_theta=10_000\n", - "kv_seq_len = 24\n", - "rotary_emb = LlamaRotaryEmbedding(\n", - " head_dim,\n", - " max_position_embeddings=max_position_embeddings,\n", - " base=rope_theta,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Huggingface checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "\n", - "FlexFlow checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "\n", - "Huggingface-FlexFlow checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", - " 1.2096541e+01 3.6424692e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", - " 1.20965424e+01 3.64246750e+00]\n", - "[ True True True ... True True True]\n", - "[2394]\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", - " 1.2096541e+01 3.6424692e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", - " 1.20965424e+01 3.64246750e+00]\n", - "[ True True True ... 
True True True]\n", - "[2394]\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "4.383680555555555% mismatch in QK prods softmax out grad\n", - "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", - "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", - "4.817708333333334% mismatch in attention input grads\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], - "source": [ - "tot_num_layers = 12\n", - "for layer_num in range(tot_num_layers-1, -1, -1):\n", - " # HuggingFace filepaths\n", - " hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", - " hf_BWD_loraB_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.go_0\"\n", - " hf_BWD_loraB_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.gi_0\"\n", - " hf_BWD_loraA_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.go_0\"\n", - " hf_BWD_loraA_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.gi_0\"\n", - " hf_loraA_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", - " hf_loraB_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", - " hf_BWD_lora_dropout_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.go_0\"\n", - " hf_BWD_lora_dropout_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.gi_0\"\n", - " hf_BWD_w2_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.go_0\"\n", - " hf_BWD_w2_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.gi_0\"\n", - " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", - " hf_BWD_w3_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.go_0\"\n", - " hf_BWD_w3_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.gi_0\"\n", - " hf_BWD_w1_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.go_0\"\n", - " hf_BWD_w1_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.gi_0\"\n", - " hf_BWD_act_fn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.gi_0\"\n", - " hf_BWD_act_fn_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.go_0\"\n", - " 
hf_BWD_ffn_norm_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.go_0\"\n", - " hf_BWD_ffn_norm_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.gi_0\"\n", - " hf_BWD_attn_out_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.go_0\"\n", - " hf_BWD_attn_q_in = f\"{hf_weight_base_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", - " hf_FWD_w1_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", - " hf_FWD_w3_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\"\n", - " hf_FWD_act_fn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.act_fn.output_0\"\n", - " hf_BWD_attn_oproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", - " hf_attn_qproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.q_proj.weight\"\n", - " hf_attn_kproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.k_proj.weight\"\n", - " hf_attn_vproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.v_proj.weight\"\n", - " hf_attn_oproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.o_proj.weight\"\n", - " # hf_BWD_attn_vproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", - " # FlexFlow filepaths\n", - " ff_BWD_w2_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", - " ff_BWD_w2_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", - " ff_BWD_w2_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_pre_input_0\"\n", - " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", - " ff_BWD_ssm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_output_0\"\n", - " ff_BWD_ssm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_0\"\n", - " ff_BWD_ssm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_1\"\n", - " ff_BWD_w3_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", - " ff_BWD_w3_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_input_0\"\n", - " ff_BWD_lora_A_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", - " ff_BWD_lora_B_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", - " ff_lora_A_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", - " ff_lora_B_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", - " 
ff_BWD_w1_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", - " ff_BWD_w1_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_input_0\"\n", - " ff_BWD_w1_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_pre_input_0\"\n", - " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", - " ff_BWD_ffn_norm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_0\"\n", - " ff_BWD_ffn_norm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_1\"\n", - " ff_BWD_ffn_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_0\"\n", - " ff_BWD_attn_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", - " ff_BWD_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_input_0\"\n", - " ff_BWD_ssm_cached_w1_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w1_output\"\n", - " ff_BWD_ssm_cached_w3_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w3_output\"\n", - " ff_FWD_w1_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", - " ff_FWD_w3_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", - " ff_FWD_act_fnc_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_act_fn_output\"\n", - " ff_BWD_attn_o_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", - " # ff_BWD_attn_v_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", - " ff_attn_oproj_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_weight_0\"\n", - " # ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", - "\n", - " # xxx = torch.load(hf_BWD_attn_out_out)\n", - " # xxx.detach().cpu().numpy().tofile(f\"{hf_BWD_attn_out_out}.flexflow\")\n", - " # print(f\"{hf_BWD_attn_out_out}.flexflow\")\n", - " \n", - " # HuggingFace checks\n", - " print(\"\\nHuggingface checks:\")\n", - " if layer_num == tot_num_layers-1:\n", - " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", - " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", - " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", - " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", - " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", - " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", - " 
check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", - " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", - "\n", - " # FlexFlow checks\n", - " print(\"\\nFlexFlow checks:\")\n", - " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", - " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", - " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", - " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", - " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", - " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", - " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", - " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", - " #compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768) # should fail\n", - "\n", - " # HF-FlexFlow checks\n", - " print(\"\\nHuggingface-FlexFlow checks:\")\n", - " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", - " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", - " #print(torch.load(hf_w2_weight).shape)\n", - " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", - " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", - "\n", - " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", - " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", - "\n", - " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", - " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", - " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", - " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", - "\n", - " compare_tensors(hf_FWD_w1_out, ff_FWD_w1_out)\n", - " compare_tensors(hf_FWD_w3_out, ff_FWD_w3_out)\n", - " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", - " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", - " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", - " # compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n", - " # compare_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2)\n", - " # compare_tensors(hf_BWD_attn_out_out, ff_BWD_ffn_norm_in2)\n", - " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", - "\n", - " # compare attn weight tensors\n", - " hidden_size = 768\n", - " qProjSize = 64\n", - " num_heads = 12\n", - " num_new_tokens = num_tokens = 24\n", - " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", - " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", - " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", - " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", - " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", - " \n", - " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", - " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", - " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", - " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", - " \n", - " 
assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", - " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", - " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", - " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", - " \n", - " # Compare attn outproj grad in tensors\n", - " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", - " \n", - " ########### Compare value projs grads ######################\n", - " # 1. compare qk prods softmax\n", - " hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.qk_prods_softmax\"\n", - " ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", - " \n", - " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", - " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - "\n", - " for head_idx in range(num_heads):\n", - " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", - " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", - " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", - " \n", - " # 2. compare attn heads grads\n", - " hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", - " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", - "\n", - " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", - " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", - " assert(np.allclose(ff_attn_heads_grads, hf_attn_heads_grads, atol=1e-2))\n", - "\n", - " # 3. 
vproj grads\n", - " hf_vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.go_0\"\n", - " ff_vproj_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", - "\n", - " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", - " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", - " assert(np.allclose(hf_vproj_grads, ff_vproj_grads, atol=1e-2))\n", - "\n", - " \n", - " \n", - " \n", - " ##############################\n", - " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", - " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", - " # print(hf_value_states.shape)\n", - " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", - " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", - " # print(ff_value_states.shape)\n", - " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", - " \n", - " \n", - " \n", - " ########## Compare key and query projs grads ##################\n", - " ff_devQKVPRojArray = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", - " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", - " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", - " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", - " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", - " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", - "\n", - " # simulate qk_prods_softmax\n", - " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", - " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", - " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", - " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", - " ff_value_states = torch.from_numpy(ff_value_states)\n", - " ff_value_states = ff_value_states.permute(1,0,2)\n", - " # print(ff_attn_heads_grads.shape)\n", - " # print(ff_value_states.shape)\n", - " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", - " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", - " #print(\"Simulated QK prods grads:\")\n", - " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", - "\n", - " # qk prods softmax right before softmax\n", - " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.go_0\"\n", - " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", - " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad\"\n", - " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", - " # 
assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", - " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", - " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", - " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", - " assert(pct_mismatch <= 0.05)\n", - "\n", - " # qk prods softmax right after softmax\n", - " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.gi_0\"\n", - " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", - " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in\"\n", - " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", - " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", - " \n", - " # qk prods softmax after mask\n", - " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.matmul_op.go_0\"\n", - " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", - " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in_masked\"\n", - " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", - " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", - "\n", - " # Compare query activation\n", - " hf_query_activation = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.query_activation\"\n", - " hf_query_activation = torch.load(hf_query_activation)\n", - " ff_query_activation = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_query_activation\"\n", - " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", - " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", - " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", - " \n", - " ########################################## ROPE and Kproj ##########################################\n", - "\n", - " # Compare FF kproj with intermediate kproj data from HF\n", - " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", - " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", - " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", - " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", - " # Check hf ROPE \n", - " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", 
- " cos = cos.cuda()\n", - " sin = sin.cuda()\n", - " # query_states: torch.Size([1, 12, 24, 64])\n", - " # key_states: torch.Size([1, 12, 24, 64])\n", - " # position_ids: torch.Size([1, 24])\n", - " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", - " # 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", - " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", - " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", - " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", - " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", - " # print(hf_kproj_grads_post_rotary[:,:,0])\n", - " \n", - " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", - " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", - " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", - " # print(hf_kproj_grads_before_rotary[:,:,0])\n", - " # Compare HF rope with manual ROPE\n", - " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", - " # Compare HF Kproj with FF Kproj (before ROPE) \n", - " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", - " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", - " #print(ff_kproj_pre[:,:,0])\n", - " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", - " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", - " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", - " assert(pct_mismatch <= 0.05)\n", - " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", - " \n", - " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", - " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " # print(\"ff_kproj: \", ff_kproj.shape)\n", - " #print(ff_kproj[:,:,0])\n", - " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", - " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", - " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", - " assert(pct_mismatch <= 0.05)\n", - " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", - " \n", - " \n", - " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", - " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", - " hf_kproj_grads = 
torch.load(hf_kproj_grads).squeeze()\n", - " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", - " #print(hf_kproj_grads[:,:64])\n", - " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", - " #print(reshaped_tensor.shape)\n", - " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", - "\n", - " ########################################## Qproj (with ROPE) ##########################################\n", - "\n", - " # Compare QProj\n", - " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", - " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", - " # print(\"HF Qproj:\")\n", - " # print(hf_qproj_grads.shape)\n", - " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", - " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", - " # print(reshaped_tensor[:,:,0])\n", - " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", - " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", - " # print(\"FF Qproj:\")\n", - " # print(ff_qproj.shape)\n", - " # print(ff_qproj[:,:,0])\n", - " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", - "\n", - " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", - " hf_attn_in = torch.load(hf_attn_in)\n", - " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", - " hf_attn_in = hf_attn_in.squeeze().T\n", - " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", - " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", - " # print(hf_attn_in)\n", - "\n", - " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", - " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", - " # print(\"ff_attn_in: \", ff_attn_in.shape)\n", - " # print(ff_attn_in)\n", - " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", - "\n", - " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", - " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", - " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", - " assert(pct_mismatch <= 0.05)\n", - " \n", - "\n", - " assert False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([12, 24, 64])\n", - "tensor([[-1.5730e-02, -4.1161e-02, 3.0593e-02, ..., 3.8630e-01,\n", - " 3.2884e-01, 3.6067e-01],\n", - " [-2.8613e+01, -5.5872e+00, 2.9385e+01, ..., 3.8782e+01,\n", - " 9.6901e+01, 9.8470e+01],\n", - " [ 3.3027e+00, 1.8276e-01, -1.8497e+00, ..., -4.4052e+01,\n", - " -2.0010e+01, -2.9788e+01],\n", - " ...,\n", - " [-7.6471e-02, -1.8892e-01, 3.6430e-01, ..., -2.7493e-01,\n", - " 5.7017e-01, -1.5986e-01],\n", - " [ 2.5780e+00, -1.8153e+00, 2.5088e+00, ..., -1.0776e+01,\n", - " 6.2167e-01, 8.3755e-01],\n", - " [-6.8324e-02, 1.7568e-01, -3.2311e-01, ..., 3.1202e+00,\n", - " -2.6652e-01, -1.1917e+00]])\n", - "(24, 64, 12)\n", - "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 
3.8629669e-01\n", - " 3.2884139e-01 3.6066702e-01]\n", - " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", - " 9.6900581e+01 9.8469597e+01]\n", - " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", - " -2.0009745e+01 -2.9787930e+01]\n", - " ...\n", - " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", - " 5.7017130e-01 -1.5985624e-01]\n", - " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", - " 6.2166649e-01 8.3755457e-01]\n", - " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", - " -2.6652411e-01 -1.1917179e+00]]\n" - ] - } - ], - "source": [ - "# value states: torch.Size([1, 12, 24, 64])\n", - "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", - "key_states = value_states\n", - "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", - "# query_states: torch.Size([1, 12, 24, 64])\n", - "# key_states: torch.Size([1, 12, 24, 64])\n", - "# position_ids: torch.Size([1, 24])\n", - "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", - "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", - "query_states = torch.zeros([1, 12, 24, 64])\n", - "position_ids = torch.arange(24).unsqueeze(0)\n", - "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", - "key_states = key_states.squeeze()\n", - "print(key_states.shape)\n", - "print(key_states[0,:,:])\n", - "print(hf_kproj_grads_before_rotary.shape)\n", - "print(hf_kproj_grads_before_rotary[:,:,0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", - " 18, 19, 20, 21, 22, 23]], device='cuda:0')" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.arange(24).unsqueeze(0).cuda()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 12, 24, 24])\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], - "source": [ - "layer_num = 11\n", - "hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", - "ff_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", - "\n", - 
"hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", - "\n", - "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", - "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", - "print(hf_qk_prods_softmax.shape)\n", - "#print(ff_qk_prods_softmax.shape)\n", - "#print(hf_qk_prods_softmax[:,:,0])\n", - "#print()\n", - "#print(ff_qk_prods_softmax[:,:,0])\n", - "\n", - "for head_idx in range(12):\n", - " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", - " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", - " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", - "\n", - "\n", - "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", - "print(hf_value_states.shape)\n", - "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", - "print()\n", - "print(attn_output.shape)\n", - "print(attn_output.transpose(1, 2).contiguous().shape)\n", - "print(\"Hf attn heads\")\n", - "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", - "\n", - "print(\"Attn heads grads:\")\n", - "hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", - "print(torch.load(hf_attn_heads_grads).shape)\n", - "print(\"HF value grads:\")\n", - "vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", - "print(torch.load(vproj_grads).shape)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([2, 3, 4])\n", - "torch.Size([4, 3, 2])\n" - ] - } - ], - "source": [ - "a = torch.randn(2,3,4)\n", - "print(a.shape)\n", - "print(a.T.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", - " 0.0000],\n", - " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", - " 39.7619],\n", - " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", - " -160.8711],\n", - " ...,\n", - " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", - " -198.4432],\n", - " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", - " -194.4037],\n", - " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", - " -124.1802]]], device='cuda:0')\n", - "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", - " -1.4912e+05, 3.5769e+06],\n", - " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", - " -2.3540e+01, 3.4587e+02],\n", - " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", - " 5.5099e+01, 5.5910e+01],\n", - " ...,\n", - " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", - " 5.0713e+01, 5.6592e+01],\n", - " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", - " 3.0760e+01, 6.1743e+01],\n", - " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", - " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" - ] - } - ], - "source": [ - "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", - "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", - "a = torch.load(a)\n", - "b = torch.load(b)\n", - "print(a)\n", - "print(b)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "# # Manual matmul checks\n", - "# ff_w2_grad_out_tensor = np.loadtxt(ff_BWD_w2_out, delimiter=',').reshape((768,128), order='F')\n", - "# ff_w2_weight_tensor = np.loadtxt(ff_w2_weight, delimiter=',').reshape((3072,768), order='F')\n", - "# ff_w2_gradin_tensor = np.matmul(ff_w2_weight_tensor, ff_w2_grad_out_tensor).reshape((3072,128), order='F')\n", - "\n", - "# ff_lora_gradout_tensor = np.loadtxt(ff_BWD_lora_B_out, delimiter=',').reshape((768,128), order='F')\n", - "# ff_lora_A_weight_tensor = np.loadtxt(ff_lora_A_weight, delimiter=',').reshape((3072,16), order='F')\n", - "# ff_lora_B_weight_tensor = np.loadtxt(ff_lora_B_weight, delimiter=',').reshape((16,768), order='F')\n", - "# ff_lora_int_grad_tensor = np.matmul(ff_lora_B_weight_tensor, ff_lora_gradout_tensor)\n", - "# ff_lora_gradint_tensor = np.matmul(ff_lora_A_weight_tensor, ff_lora_int_grad_tensor)\n", - "\n", - "# # ff_w2_gradin_tensor = ff_w2_gradin_tensor + ff_lora_gradint_tensor\n", - "# #print(ff_w2_gradin_tensor[:,:24])\n", - "# print(\"calculated LORA grad in\")\n", - "# print(ff_lora_gradint_tensor[:,:24])\n", - "# # ff_BWD_w2_in_pre_tensor = np.loadtxt(ff_BWD_w2_in_pre, delimiter=',').reshape((3072,128), order='F')\n", - "# ff_BWD_lora_A_in_tensor = np.loadtxt(ff_BWD_lora_A_in, delimiter=',').reshape((3072,128), order='F')\n", - "# print(\"FlexFlow LORA grad in\")\n", - "# print(ff_BWD_lora_A_in_tensor[:,:24])\n", - "# # print(ff_BWD_w2_in_pre_tensor[:,:24])\n", - "# print(\"HF lora grad in\")\n", - "# print(torch.load(hf_BWD_loraA_in).squeeze().T.detach().cpu().numpy())\n", - "# compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", - "\n", - "# simulate act_fn_grad\n", - "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')\n", - "# w3_fwd_out_tensor = np.loadtxt(ff_FWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", - "# #print(ssm_out_grad_tensor.shape, w3_fwd_out_tensor.shape)\n", - "# act_fn_out_check = np.multiply(ssm_out_grad_tensor, w3_fwd_out_tensor)\n", - "# print(\"simulated act fn out - simulated\")\n", - "# print(act_fn_out_check[:,:24])\n", - "# print(\"simulated act fn out - HF\")\n", - "# print(torch.load(hf_BWD_act_fn_out).detach().cpu().numpy().squeeze().T)\n", - "\n", - "# Simulated w3_grad\n", - "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')[:,:24]\n", - "# act_fnc_out_tensor = np.loadtxt(ff_FWD_act_fnc_out, delimiter=',').reshape((3072,24), order='F')\n", - "# w3_out_gard_check = np.multiply(ssm_out_grad_tensor, act_fnc_out_tensor)\n", - "# print(\"simulated w3 out - FF\")\n", - "# print(w3_out_gard_check)\n", - "# ff_BWD_w3_out_tensor = np.loadtxt(ff_BWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", - "# hf_BWD_w3_out_tensor = torch.load(hf_BWD_w3_out).detach().cpu().numpy().squeeze().T\n", - "# print(\"w3 out, FF\")\n", - "# print(ff_BWD_w3_out_tensor[:,:24])\n", - "# print(\"w3 out, HF\")\n", - "# print(hf_BWD_w3_out_tensor)\n", - "\n", - "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", - "# assert False\n", - "# print()\n", - "# print()\n", - "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", - "# print_tensors(hf_BWD_w3_in, ff_BWD_w3_in, \"w3 in\")\n", - "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out, \"w1 out\")\n", - "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in, \"w1 in\")\n", - "# print_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out, \"ffn norm out\")\n", - "# print_tensors(hf_BWD_ffn_norm_in, 
ff_BWD_ffn_norm_in2, \"ffn norm in\")\n", - "# print()\n", - "# ff_w1_out_tensor = np.loadtxt(ff_BWD_w1_out, delimiter=',').reshape((3072,128), order='F')\n", - "# ff_w1_in_tensor = np.loadtxt(ff_BWD_w1_in, delimiter=',').reshape((768,128), order='F')\n", - "# ff_w1_in_pre_tensor = np.loadtxt(ff_BWD_w1_in_pre, delimiter=',').reshape((768,128), order='F')\n", - "# ff_w1_only_in_tensor = ff_w1_in_tensor - ff_w1_in_pre_tensor\n", - "# ff_w1_weight_tensor = np.loadtxt(ff_w1_weight, delimiter=',').reshape((768,3072), order='F')\n", - "# ff_w1_in_check_tensor = np.matmul(ff_w1_weight_tensor, ff_w1_out_tensor)\n", - "# print(\"W1 in (simulated):\")\n", - "# print(ff_w1_in_check_tensor[:,:24])\n", - "# print(\"W1 in (FF):\")\n", - "# print(ff_w1_only_in_tensor[:,:24])\n", - "# print(\"W1 in (HF):\")\n", - "# print(torch.load(hf_BWD_w1_in).squeeze().T.detach().cpu().numpy())\n", - "\n", - "# compare_tensors_difference(hf_BWD_w2_in, ff_BWD_w2_in, ff_BWD_lora_A_in)\n", - "# compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", - "#compare_hf_tensors(hf_BWD_ffn_norm_in, hf_BWD_attn_out_out)\n", - "# print(\"\\nw1 out:\")\n", - "\n", - "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", - "# print(\"\\nW1 in\\n\")\n", - "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", - "# compare_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", - "# print(\"\\nffn_norm\")\n", - "# compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" - ] - } - ], - "source": [ - "for layer_num in range(12):\n", - " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", - " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", - " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", - " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", - " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", - " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", - " hf_w1_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.gate_proj.weight\"\n", - " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", - " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", - " hf_w3_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.up_proj.weight\"\n", - " ff_w3_weight 
= f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", - " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", - " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", - " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", - " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 7836633b30..1e0e0bd167 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -60,6 +60,7 @@ def lm_head_pre_backward_hook(module, grad_output): def peft_backward_hook(module, grad_input, grad_output): + assert(type(grad_input) == tuple and type(grad_output) == tuple) if len(grad_input) == 0 or len(grad_output) == 0: return assert module.name is not None and module.bwd_step is not None @@ -95,23 +96,53 @@ def peft_forward_hook(module, input, output): name = module.name.replace("base_model.model.model.", "") print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") print("Input:") - for i, inp in enumerate(input): - if type(inp) == torch.Tensor: - print(inp.shape) - torch.save( - inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}" - ) - else: - print(inp) + if type(input) == torch.Tensor: + print(input.shape) + torch.save( + input, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_0" + ) + elif type(input) == tuple: + for i, inp in enumerate(input): + if type(inp) == torch.Tensor: + print(inp.shape) + torch.save( + inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}" + ) + else: + print(inp) + else: + assert False print("Output:") - for i, out in enumerate(output): - if type(out) == torch.Tensor: - print(out.shape) - torch.save( - out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}" - ) - else: - print(out) + if type(output) == torch.Tensor: + print(output.shape) + torch.save( + output, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_0" + ) + # if "layer_norm" in name: + # torch.save( + # output.grad_fn._saved_result1, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_1" + # ) + # torch.save( + # output.grad_fn._saved_result2, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_2" + # ) + elif type(output) == tuple: + for i, out in enumerate(output): + if type(out) == torch.Tensor: + print(out.shape) + torch.save( + out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}" + ) + # if "layer_norm" in name: + # torch.save( + # out.grad_fn._saved_result1, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_1" + # ) + # torch.save( + # out.grad_fn._saved_result2, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_2" + # ) + else: + print(out) + else: + assert False # print("Forward Input/Output: ", input[0].shape, output[0].shape) print("===") module.fwd_step += 1 
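
Note on how the tensors saved above are consumed: the files these hooks write under ./hf_peft_tensors/ are the torch.load() inputs of the alignment notebook earlier in this series, which compares them against FlexFlow's comma-separated, column-major dumps. The notebook's compare_tensors helper is defined outside this excerpt; the sketch below is only an illustration of that comparison convention, and the helper name, the ff_shape argument, and the 5% threshold are assumptions inferred from the notebook calls, not the actual implementation.

import numpy as np
import torch

def compare_tensors_sketch(hf_path, ff_path, ff_shape, tolerance=1e-5):
    # Hypothetical helper mirroring the notebook's pattern, not the real compare_tensors.
    # HF side: tensors were saved with torch.save() by the hooks in this file;
    # the notebook typically squeezes and transposes them before comparing.
    hf = torch.load(hf_path).squeeze().T.detach().cpu().numpy()
    # FF side: FlexFlow writes flat comma-separated values in column-major
    # (Fortran) order, hence reshape(..., order='F') as in the notebook.
    # ff_shape must match the transposed HF layout chosen above.
    ff = np.loadtxt(ff_path, delimiter=",").reshape(ff_shape, order="F")
    # Count elementwise mismatches outside the absolute tolerance.
    mismatching = np.where(~np.isclose(hf, ff, atol=tolerance))[0]
    pct_mismatch = len(mismatching) / hf.size
    print(f"{pct_mismatch * 100}% mismatch")
    # The notebook tolerates up to 5% elementwise mismatch for some gradients.
    assert pct_mismatch <= 0.05
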
@@ -221,10 +252,13 @@ def main(): layer.register_full_backward_pre_hook(lm_head_pre_backward_hook) # Save any weights of interest for name, params in model.named_parameters(): + simplified_name = name.replace("base_model.model.model.", "") if "lora" in name: - torch.save(params, f"./hf_peft_tensors/{name}") + torch.save(params, f"./hf_peft_tensors/{simplified_name}") if "lm_head" in name or "norm" in name: - torch.save(params, f"./hf_peft_tensors/{name}") + torch.save(params, f"./hf_peft_tensors/{simplified_name}") + if "down_proj" in name or "self_attn" in name: + torch.save(params, f"./hf_peft_tensors/{simplified_name}") # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index ad1f903cfb..1fde4d5a50 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -92,14 +92,6 @@ def main(): model = PeftModel.from_pretrained(model, peft_model_id) print(model) - for name, params in model.named_parameters(): - print(name) - if ( - name - == "base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight" - ): - print(params) - assert False # Register hooks to save tensors, if needed if save_peft_tensors: diff --git a/tests/peft/qk_prods_alignment.ipynb b/tests/peft/qk_prods_alignment.ipynb deleted file mode 100644 index c2a3644b3d..0000000000 --- a/tests/peft/qk_prods_alignment.ipynb +++ /dev/null @@ -1,24 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 32f0a15412eabdfb45bfce48cbd489a3e5ddbac5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 22:45:50 -0500 Subject: [PATCH 127/198] fix legion aliasing error --- .../ops/add_bias_residual_layer_norm.h | 1 + src/ops/add_bias_residual_layer_norm.cc | 122 ++++++++---------- src/ops/linear.cc | 2 - src/ops/residual_layer_norm.cc | 51 +++----- src/ops/residual_rms_norm.cc | 58 ++++----- 5 files changed, 98 insertions(+), 136 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 38bb825a4d..550d56c47c 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -26,6 +26,7 @@ class AddBiasResidualLayerNorm : public Op { float _eps, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a2b426ec0d..6b71279971 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -348,10 +348,13 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); // attn output + // added: attn_output + attn final bias + residual launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -362,34 +365,27 @@ void 
AddBiasResidualLayerNorm::init_inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(4, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -397,7 +393,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(5, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -420,10 +416,13 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - // attn output + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + // input: attn output + // added: attn_output + attn final bias + residual launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -434,34 +433,27 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[1]->region)); launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(4, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -469,7 +461,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(5, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -478,13 +470,11 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final 
attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I/O): attn output AND added output (attn output + final attn bias + + residual) regions[1](I): residual regions[2](O): layer norm output + regions[3](I): final attn bias + regions[4](I): gamma + regions[5](I): beta */ OpMeta *AddBiasResidualLayerNorm::init_task( Task const *task, @@ -545,10 +535,13 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); // input + // added_output: input + attn bias + residual launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -566,20 +559,13 @@ FutureMap AddBiasResidualLayerNorm::inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(2, FID_DATA); - // added_output: input + attn bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(3, FID_DATA); // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); if (elementwise_affine) { // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, @@ -587,7 +573,7 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(4, FID_DATA); if (use_bias) { // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -595,20 +581,31 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(5, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); } +void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + /* - regions[0](I): input + regions[0](I): input / added output regions[1](I): attn bias regions[2](I): residual - regions[3](O): added output - regions[4](O): output - regions[5](I): gamma - regions[6](I): beta + regions[3](O): output + regions[4](I): gamma + regions[5](I): beta */ void AddBiasResidualLayerNorm::inference_task( Task const *task, @@ -626,7 +623,7 @@ void AddBiasResidualLayerNorm::inference_task( *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == - 5 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + 4 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -635,9 +632,9 @@ void AddBiasResidualLayerNorm::inference_task( GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR gamma, beta; @@ -648,9 +645,9 @@ void AddBiasResidualLayerNorm::inference_task( Domain residual_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); + ctx, task->regions[0].region.get_index_space()); Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[3].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -675,23 +672,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[5], - task->regions[5], + regions[4], + task->regions[4], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); + ctx, task->regions[4].region.get_index_space()); if (m->use_bias) { beta = helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[6], - task->regions[6], + regions[5], + task->regions[5], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); + ctx, task->regions[5].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -723,12 +720,7 @@ void AddBiasResidualLayerNorm::inference_task( } } AddBiasResidualLayerNorm::save_inference_tensors_to_file( - m, - shard_id, - bc, - {input, residual}, - weights_accessors, - {added_output, output}); + m, shard_id, bc, {residual}, weights_accessors, {added_output, output}); } } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 209f514f65..4563673385 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -621,8 +621,6 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 8563c299ab..dc302ce19c 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -358,11 +358,14 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); int field_id = 0; // input + // added: input + residual(s) launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + 
READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); @@ -382,13 +385,6 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -433,11 +429,14 @@ void ResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); int field_id = 0; // input + // added: input + residual(s) launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); @@ -457,13 +456,6 @@ void ResidualLayerNorm::init(FFModel const &ff) { inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -884,11 +876,14 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); int field_id = 0; // input + // added: input + residual(s) launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); @@ -908,13 +903,6 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -956,7 +944,7 @@ void ResidualLayerNorm::inference_task( } assert(regions.size() == - 4 + m->use_two_residuals + + 3 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); int region_idx = 0, task_region_idx = 0; @@ -984,13 +972,8 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = - helperGetGenericTensorAccessorWO(m->output_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); + GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -1029,7 +1012,7 @@ void ResidualLayerNorm::inference_task( assert(residual2_domain == in_domain); } Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[task_region_idx++].region.get_index_space()); + ctx, task->regions[0].region.get_index_space()); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -1069,7 +1052,7 @@ void ResidualLayerNorm::inference_task( assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - input_accessors.push_back(input); + // input_accessors.push_back(input); input_accessors.push_back(residual1); if (m->use_two_residuals) { input_accessors.push_back(residual2); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index c2fbe11544..fb0944cece 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -261,6 +261,8 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -273,24 +275,18 @@ void ResidualRMSNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -318,9 +314,11 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -330,24 +328,18 @@ void ResidualRMSNorm::init_inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection 
id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[1]->region));
-  launcher.add_field(3, FID_DATA);
+  launcher.add_field(2, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(weights[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
                                                     EXCLUSIVE,
                                                     weights[0]->region));
-  launcher.add_field(4, FID_DATA);
+  launcher.add_field(3, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -398,6 +390,8 @@ FutureMap
                          0 /*mapper_id*/,
                          machine_view_hash);
   launcher.add_future(bc);
+  assert(batch_outputs[0]->part == batch_inputs[0]->part);
+  assert(batch_outputs[0]->region == batch_inputs[0]->region);
   launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -410,40 +404,33 @@ FutureMap
                                                     EXCLUSIVE,
                                                     batch_inputs[1]->region));
   launcher.add_field(1, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[1]->region));
-  launcher.add_field(3, FID_DATA);
+  launcher.add_field(2, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(weights[0]->part,
                                                     0 /*projection id*/,
                                                     READ_WRITE,
                                                     EXCLUSIVE,
                                                     weights[0]->region));
-  launcher.add_field(4, FID_DATA);
+  launcher.add_field(3, FID_DATA);
   return runtime->execute_index_space(ctx, launcher);
 }
 
 /*
-  regions[0](I): input1
+  regions[0](I/O): input1 / residual output
   regions[1](I): input2
-  regions[2](O): residual output
-  regions[3](O): output
-  regions[4](I/O): weight
+  regions[2](O): output
+  regions[3](I/O): weight
 */
 void ResidualRMSNorm::inference_task(Task const *task,
                                      std::vector const &regions,
                                      Context ctx,
                                      Runtime *runtime) {
-  assert(task->regions.size() == 5);
-  assert(regions.size() == 5);
+  assert(task->regions.size() == 4);
+  assert(regions.size() == 4);
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     return;
@@ -453,19 +440,20 @@ void ResidualRMSNorm::inference_task(Task const *task,
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO(
       m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  // residual_output is mapped to the same region as the input
   GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO(
-      m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+      m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime);
+      m->output_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime);
   GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
-      m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime);
+      m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime);
   inference_kernel_wrapper(
       m, bc, input1, input2, weight, residual_output, output);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
     ResidualRMSNorm::save_inference_tensors_to_file(
-        m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output});
+        m, shard_id, bc, {input2}, {weight}, {residual_output, output});
} From c97f63a368b22363b26667a6a963fee0170aea60 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 23:45:04 -0500 Subject: [PATCH 128/198] fix warnings --- src/ops/lora_linear.cc | 14 ++++++++++++-- src/ops/residual_rms_norm.cc | 4 ++-- src/ops/rms_norm.cc | 2 +- src/ops/sigmoid_silu_multi.cc | 4 ++-- src/runtime/model.cc | 6 +++--- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index c02bddc5a6..409c814329 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -272,8 +272,6 @@ void load_peft_from_file(DT *ptr, size_t size, int shard_id, std::string filepath) { - std::cout << "Loading LORA weight " << filepath << ", size: " << size - << ", shard: " << shard_id << std::endl; std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { printf("Could not open file: %s\n", filepath.c_str()); @@ -360,13 +358,25 @@ void LoraLinear::register_model_task(Task const *task, std::string w1_filepath = join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (float *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (float *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (half *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (half *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); } else { diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index fb0944cece..e549e5f6da 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -639,7 +639,7 @@ Legion::FutureMap launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); @@ -647,7 +647,7 @@ Legion::FutureMap launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index a1749d66af..b9c9206a00 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -548,7 +548,7 @@ Legion::FutureMap launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE,
                        EXCLUSIVE,
                        batch_inputs[0]->region_grad));
   launcher.add_field(1, FID_DATA);
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index c01f47aa21..c9f86c42cb 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -384,7 +384,7 @@ FutureMap
   launcher.add_region_requirement(
       RegionRequirement(batch_inputs[0]->part_grad,
                         0 /*projection id*/,
-                        READ_WRITE,
+                        reset_input_grads[0] ? WRITE_ONLY : READ_WRITE,
                         EXCLUSIVE,
                         batch_inputs[0]->region_grad));
   launcher.add_field(1, FID_DATA);
@@ -392,7 +392,7 @@ FutureMap
   launcher.add_region_requirement(
       RegionRequirement(batch_inputs[1]->part_grad,
                         0 /*projection id*/,
-                        READ_WRITE,
+                        reset_input_grads[1] ? WRITE_ONLY : READ_WRITE,
                         EXCLUSIVE,
                         batch_inputs[1]->region_grad));
   launcher.add_field(2, FID_DATA);
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 812a432ef1..9512a0c21a 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -5546,7 +5546,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant(
-          registrar, "RMS Norm Inference Task");
+          registrar, "Residual RMS Norm Inference Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
@@ -5562,7 +5562,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant(
-          registrar, "RMS Norm Backward Task");
+          registrar, "Residual RMS Norm Backward Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
@@ -5577,7 +5577,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant(
-          registrar, "RMS Norm PEFT Backward Task");
+          registrar, "Residual RMS Norm PEFT Backward Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;

From 3d5a37c70cfb76485b35a6669b4ee90f97476bb9 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Sat, 27 Jan 2024 17:31:45 -0500
Subject: [PATCH 129/198] fix

---
 include/flexflow/ops/kernels/lora_linear_kernels.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h
index cf03e518fa..739b94ed22 100644
--- a/include/flexflow/ops/kernels/lora_linear_kernels.h
+++ b/include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -19,7 +19,6 @@ class LoraLinearMeta : public OpMeta {
 public:
   LoraLinearMeta(FFHandler handle, LoraLinear const *li);
   ~LoraLinearMeta(void);
-  char op_name[MAX_OPNAME];
   // PEFT related fields
   void *low_rank_activation;
   void *input_activation;

From 571f0d375a6fde72267a72ded40878706ab8ab17 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 29 Jan 2024 05:39:00 +0000
Subject: [PATCH 130/198] fix pipeline parallelism

---
 src/runtime/inference_manager.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 4f7d0c9632..e82347c981 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -276,9 +276,9 @@ void InferenceManager::init_operators_inference(FFModel *model) {
       assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE);
       assert(tensor_buffer[op->outputs[i]].size() > batch_index);
       outputs[i] = tensor_buffer[op->outputs[i]][batch_index];
-      if (i > 0) {
-        assert(outputs[0]->machine_view == outputs[i]->machine_view);
-      }
+      // if (i
> 0) { + // assert(outputs[0]->machine_view == outputs[i]->machine_view); + // } assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } if (op->is_parallel_op()) { From f4a10f3316d0d9f41f2b1dcad97a1618840cfc51 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 06:57:15 +0000 Subject: [PATCH 131/198] fix tp issue in combine op --- src/parallel_ops/combine.cc | 3 --- src/runtime/model.cc | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 3433e2f21b..8411b42602 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -84,9 +84,6 @@ Combine::Combine(FFModel &model, dims[i] = _input->dims[i]; } assert(combine_degree > 0 && "Must use combine_degree > 0"); - std::cout << "combine_dim : " << combine_dim - << ", dims[combine_dim].degree: " << dims[combine_dim].degree - << ", combine_degree: " << combine_degree << std::endl; assert(dims[combine_dim].degree % combine_degree == 0); dims[combine_dim].degree /= combine_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 9512a0c21a..81cf3d966d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3283,11 +3283,11 @@ void FFModel::create_operators_from_layers() { inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } Op *op = nullptr; - // add a combine before arg_topk + // add a combine before arg_topk / argmax if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX))) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, From ca683f7fca21997e9b3c61a9f331ed6ca1c4ec81 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 07:33:22 +0000 Subject: [PATCH 132/198] fix lora weight loading with tensor parallelism --- src/ops/lora_linear.cc | 11 ++++++----- src/runtime/inference_manager.cc | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 409c814329..81dc2292f6 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -270,6 +270,7 @@ void LoraLinear::register_peft_model( template void load_peft_from_file(DT *ptr, size_t size, + bool sharded, int shard_id, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); @@ -279,7 +280,7 @@ void load_peft_from_file(DT *ptr, assert(in.good() && "incorrect weight file path"); std::vector
host_array(size); size_t target_data_size = sizeof(DT) * size; - in.seekg(shard_id * target_data_size, in.beg); + in.seekg(sharded * shard_id * target_data_size, in.beg); in.read((char *)host_array.data(), target_data_size); size_t in_get_size = in.gcount(); @@ -362,23 +363,23 @@ void LoraLinear::register_model_task(Task const *task, << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (float *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); } else if (dt == DT_HALF) { std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (half *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (half *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); } else { assert(false && "Data type not supported"); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index e82347c981..9fe9066d6c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -390,12 +390,22 @@ void InferenceManager::peft_bwd(FFModel *model, while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { last_op -= 1; } - // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || - model->operators[last_op]->op_type == OP_FUSED); - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); + if (model->config.tensor_parallelism_degree > 1) { + if (model->operators[last_op]->op_type == OP_FUSED) { + FusedOp *fused_op = static_cast(model->operators[last_op]); + assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_COMBINE); + assert(fused_op->op_op_type[fused_op->numOperators - 2] == OP_SOFTMAX); + } else { + assert(model->operators[last_op]->op_type == OP_COMBINE) + assert(model->operators[last_op-1]->op_type == OP_SOFTMAX) + } + } else { + // Assert that the previous operator must be softmax + assert(model->operators[last_op]->op_type == OP_SOFTMAX || model->operators[last_op]->op_type == OP_FUSED); + if (model->operators[last_op]->op_type == OP_FUSED) { + FusedOp *fused_op = static_cast(model->operators[last_op]); + assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); + } } for (int o = last_op; o >= 0; o--) { Op *op = model->operators[o]; From 378bdb5ba157f18d528c65aa0c7a7dba2ec26c08 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 07:45:10 +0000 Subject: [PATCH 133/198] fixes, implement Combine::peft_bwd_task --- include/flexflow/model.h | 1 + include/flexflow/parallel_ops/combine.h | 9 +++ src/ops/lora_linear.cc | 7 +-- src/parallel_ops/combine.cc | 76 
+++++++++++++++++++++++++ src/runtime/inference_manager.cc | 7 ++- src/runtime/model.cc | 15 +++++ 6 files changed, 107 insertions(+), 8 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 851fac94d2..73c985f757 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -240,6 +240,7 @@ enum TaskIDs { COMBINE_INIT_TASK_ID, COMBINE_FWD_TASK_ID, COMBINE_BWD_TASK_ID, + COMBINE_PEFT_BWD_TASK_ID, REPLICATE_INIT_TASK_ID, REPLICATE_FWD_TASK_ID, REPLICATE_BWD_TASK_ID, diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 2e4fdb86a9..cca34de119 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -40,6 +40,11 @@ class Combine : public ParallelOp { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( @@ -56,6 +61,10 @@ class Combine : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); template static void forward_task_with_type(Legion::Task const *task, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 81dc2292f6..366eca27b7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -268,11 +268,8 @@ void LoraLinear::register_peft_model( } template -void load_peft_from_file(DT *ptr, - size_t size, - bool sharded, - int shard_id, - std::string filepath) { +void load_peft_from_file( + DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { printf("Could not open file: %s\n", filepath.c_str()); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 8411b42602..7d56d7e46b 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -275,6 +275,47 @@ void Combine::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Combine::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void Combine::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -397,6 +438,41 @@ void Combine::forward_task_with_type(Task const *task, forward_kernel
(input_ptr, output_ptr, output_domain.get_volume()); } +void Combine::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + CombineMeta const *m = *((CombineMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + DataType data_type = output_grad.data_type; + assert(input_grad.data_type == data_type); + assert(output_grad.domain == input_grad.domain); + if (data_type == DT_FLOAT) { + backward_kernel(output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_DOUBLE) { + backward_kernel(output_grad.get_double_ptr(), + input_grad.get_double_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT32) { + backward_kernel(output_grad.get_int32_ptr(), + input_grad.get_int32_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT64) { + backward_kernel(output_grad.get_int64_ptr(), + input_grad.get_int64_ptr(), + output_grad.domain.get_volume()); + } else { + assert(false && "Unsupported data type in Combine backward"); + } +} + void Combine::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 9fe9066d6c..ae3b7eaa14 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -396,12 +396,13 @@ void InferenceManager::peft_bwd(FFModel *model, assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_COMBINE); assert(fused_op->op_op_type[fused_op->numOperators - 2] == OP_SOFTMAX); } else { - assert(model->operators[last_op]->op_type == OP_COMBINE) - assert(model->operators[last_op-1]->op_type == OP_SOFTMAX) + assert(model->operators[last_op]->op_type == OP_COMBINE); + assert(model->operators[last_op - 1]->op_type == OP_SOFTMAX); } } else { // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || model->operators[last_op]->op_type == OP_FUSED); + assert(model->operators[last_op]->op_type == OP_SOFTMAX || + model->operators[last_op]->op_type == OP_FUSED); if (model->operators[last_op]->op_type == OP_FUSED) { FusedOp *fused_op = static_cast(model->operators[last_op]); assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 81cf3d966d..42283f570e 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6726,6 +6726,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_PEFT_BWD_TASK_ID, + "Combine PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Replicate { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); From afdae452ad1502f4f1d4ad01ca2d19380ad0fc22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 
07:52:33 +0000
Subject: [PATCH 134/198] fix

---
 src/parallel_ops/combine.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc
index 7d56d7e46b..7260a2745e 100644
--- a/src/parallel_ops/combine.cc
+++ b/src/parallel_ops/combine.cc
@@ -313,7 +313,7 @@ FutureMap Combine::peft_bwd(FFModel const &ff,
                         EXCLUSIVE,
                         batch_inputs[0]->region_grad));
   launcher.add_field(1, FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
+  return runtime->execute_index_space(ctx, launcher);
 }
 
 void Combine::backward(FFModel const &ff) {

From 5660f55d8e60ccebfb02a71255ede13e4e8fdf83 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 29 Jan 2024 08:50:55 +0000
Subject: [PATCH 135/198] replicate peft bwd

---
 include/flexflow/model.h                  |  1 +
 include/flexflow/parallel_ops/replicate.h |  9 ++++
 src/parallel_ops/replicate.cc             | 65 +++++++++++++++++++++++
 src/runtime/model.cc                      | 14 +++++
 4 files changed, 89 insertions(+)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 73c985f757..974a079ddb 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -244,6 +244,7 @@ enum TaskIDs {
   REPLICATE_INIT_TASK_ID,
   REPLICATE_FWD_TASK_ID,
   REPLICATE_BWD_TASK_ID,
+  REPLICATE_PEFT_BWD_TASK_ID,
   REDUCTION_INIT_TASK_ID,
   REDUCTION_FWD_TASK_ID,
   REDUCTION_BWD_TASK_ID,
diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h
index 65d69d8564..c27616634f 100644
--- a/include/flexflow/parallel_ops/replicate.h
+++ b/include/flexflow/parallel_ops/replicate.h
@@ -54,10 +54,19 @@ class Replicate : public ParallelOp {
                            std::vector const &regions,
                            Legion::Context ctx,
                            Legion::Runtime *runtime);
+  Legion::FutureMap peft_bwd(FFModel const &,
+                             BatchConfigFuture const &bc,
+                             std::vector const &,
+                             std::vector const &,
+                             MachineView const *mv = nullptr) override;
   static void backward_task(Legion::Task const *task,
                             std::vector const &regions,
                             Legion::Context ctx,
                             Legion::Runtime *runtime);
+  static void peft_bwd_task(Legion::Task const *task,
+                            std::vector const &regions,
+                            Legion::Context ctx,
+                            Legion::Runtime *runtime);
   static void forward_kernel_wrapper(ReplicateMeta const *m,
                                      GenericTensorAccessorR const &input,
                                      GenericTensorAccessorW const &output,
diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc
index 20face74e8..701db40b49 100644
--- a/src/parallel_ops/replicate.cc
+++ b/src/parallel_ops/replicate.cc
@@ -273,6 +273,45 @@ void Replicate::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
+FutureMap Replicate::peft_bwd(FFModel const &ff,
+                              BatchConfigFuture const &bc,
+                              std::vector const &batch_inputs,
+                              std::vector const &batch_outputs,
+                              MachineView const *mv) {
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  assert(numOutputs == 1);
+  assert(numInputs == 1);
+  assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type);
+  DataType data_type = batch_inputs[0]->data_type;
+  parallel_is = batch_outputs[0]->parallel_is;
+  MachineView const *view = mv ?
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Replicate::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -378,6 +417,32 @@ void Replicate::forward_task(Task const *task, } } +void Replicate::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support the outter most dimension + for (int i = 0; i < output_grad_domain.get_dim() - 1; i++) { + assert(output_grad_domain.lo()[i] == input_grad_domain.lo()[i]); + assert(output_grad_domain.hi()[i] == input_grad_domain.hi()[i]); + } + size_t num_elements = input_grad_domain.get_volume(); + size_t num_replicas = output_grad_domain.get_volume() / num_elements; + float const *output_grad_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *input_grad_ptr = helperGetTensorPointerRW( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + backward_kernel( + output_grad_ptr, input_grad_ptr, num_elements, num_replicas); +} + void Replicate::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 42283f570e..11311053e9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6784,6 +6784,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, "Replicate PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Reduction { TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); From a9bacd31ab937a364ec926c9339f970c9e918b6c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 30 Jan 2024 05:54:43 +0000 Subject: [PATCH 136/198] fixes --- .../flexflow/ops/kernels/softmax_kernels.h | 1 + include/flexflow/parallel_ops/parallel_op.h | 2 +- src/ops/fused.cu | 1 + src/ops/kernels/softmax.cu | 25 ++++++---- src/ops/softmax.cc | 46 +++++++++++++------ src/parallel_ops/combine.cc | 30 +++++++++++- src/parallel_ops/partition.cc | 5 ++ src/parallel_ops/reduction.cc | 7 +++ src/parallel_ops/replicate.cc | 36 +++++++++------ src/runtime/model.cc | 3 +- 10 files changed, 114 
insertions(+), 42 deletions(-) diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index b3dfe4f430..0b7f1090f6 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -38,6 +38,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, + bool is_last_op, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorW const &output_grad); diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index 0bf573996c..39324c2a51 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -41,7 +41,7 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; std::unordered_map - inference_input_lps; + inference_input_lps, inference_output_grad_lps; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f6bed71f6a..55892ab7e9 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -642,6 +642,7 @@ __host__ void Kernels::Softmax::inference_kernel_wrapper( m, bc, + (op == fused->numOperators - 1), my_input_accessor[0], my_output_accessor[0], output_accessor[fused->numOutputs]); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 271a291b09..c8bc242af0 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -120,6 +120,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, + bool is_last_op, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorW const &output_grad) { @@ -139,11 +140,13 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); - checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), - output.get_float_ptr(), - output.domain.get_volume() * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, bc, @@ -151,11 +154,13 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); - checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), - output.get_half_ptr(), - output.domain.get_volume() * sizeof(half), - cudaMemcpyDeviceToDevice, - stream)); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); + } } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 1d062b552b..cfc3cf6e40 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -355,14 +355,25 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - // we add the region below in order to copy the output to the grad tensor - launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // 
output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -371,8 +382,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 3); - assert(task->regions.size() == 3); + assert(regions.size() == 3 || regions.size() == 2); + bool is_last_op = (regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -384,9 +395,16 @@ void Softmax::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - inference_kernel_wrapper(m, bc, input, output, output_grad); + GenericTensorAccessorW output_grad; + if (is_last_op) { + output_grad = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + } + inference_kernel_wrapper(m, bc, is_last_op, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -428,7 +446,7 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7260a2745e..737998b141 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -205,6 +205,11 @@ void Combine::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap Combine::inference(FFModel const &ff, @@ -244,6 +249,25 @@ FutureMap Combine::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + 
batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -300,7 +324,7 @@ FutureMap Combine::peft_bwd(FFModel const &ff, 0 /*mapper_id*/, machine_view_hash); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], 0 /*projection id*/, READ_ONLY, EXCLUSIVE, @@ -309,7 +333,7 @@ FutureMap Combine::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); @@ -400,6 +424,7 @@ void Combine::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("INF combine\n"); assert(regions.size() == 2); assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); @@ -442,6 +467,7 @@ void Combine::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("BWD combine\n"); assert(regions.size() == 2); assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 353b3ce398..df3c56346c 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -197,6 +197,11 @@ void Repartition::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5dca591328..2e7b4b6723 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -122,6 +122,13 @@ void Reduction::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + reduction_dim, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Reduction::init_task(Task const *task, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 701db40b49..e4f19faa0a 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -122,6 +122,12 @@ void Replicate::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is a disjoint partition + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Replicate::init_task(Task const *task, @@ -274,10 +280,10 @@ void Replicate::forward(FFModel const &ff) { } FutureMap Replicate::peft_bwd(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime 
*runtime = ff.config.lg_hlr; @@ -297,17 +303,19 @@ FutureMap Replicate::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 11311053e9..eca8c31785 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6785,7 +6785,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, "Replicate PEFT Backward"); + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, + "Replicate PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { From f3a97ff3832261393b4c7f0c6231fe292b4964c9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 31 Jan 2024 02:45:49 +0000 Subject: [PATCH 137/198] fix --- src/parallel_ops/combine.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 737998b141..354faa5e1a 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -261,7 +261,7 @@ FutureMap Combine::inference(FFModel const &ff, } if (ff.operators[last_op] == this) { launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, @@ -424,9 +424,8 @@ void Combine::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - printf("INF combine\n"); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + // assert(regions.size() == 2); + // assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { From e0a58bb73364660be05aa8162e960399b5f9d557 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 31 Jan 2024 23:48:02 +0000 Subject: [PATCH 138/198] fix combine and fwd-bwd pass dependencies --- include/flexflow/model.h | 1 + src/ops/softmax.cc | 1 + src/parallel_ops/allreduce.cc | 2 +- src/parallel_ops/combine.cc | 60 +++++++++++++++----------------- src/runtime/inference_manager.cc | 24 ++++--------- src/runtime/model.cc | 36 ++++++++++++++++--- 6 files changed, 69 insertions(+), 55 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 974a079ddb..b3a6a85808 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1124,6 +1124,7 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool need_to_add_combine(int 
layer_idx) const; bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index cfc3cf6e40..90f77ab760 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -360,6 +360,7 @@ FutureMap Softmax::inference(FFModel const &ff, assert(ff.config.computationMode == COMP_MODE_INFERENCE); int last_op = ff.operators.size() - 1; assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_ARG_TOPK || ff.operators[last_op]->op_type == OP_SAMPLING); last_op -= 1; while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 4478a2aedc..05c2761e3b 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -365,7 +365,7 @@ FutureMap AllReduce::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 354faa5e1a..a328ec7cac 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -199,12 +199,18 @@ void Combine::create_input_partition_inference( assert(ff.config.computationMode == COMP_MODE_INFERENCE); assert(batch_outputs[0]->part != LogicalPartition::NO_PART); assert(batch_inputs[0]->part != LogicalPartition::NO_PART); - // input_lp is a disjoint partition + // partition batch_inputs[0]->region into inference_input_lps[batch_inputs[0]] + // according to the partitioning of batch_outputs[0] (i.e. make the + // partitioned dimension whole again by combining the partitions) ff.create_disjoint_partition(batch_outputs[0]->num_dims, batch_outputs[0]->dims, batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // partition batch_outputs[0]->region_grad into + // inference_output_grad_lps[batch_outputs[0]] according to the partitioning + // of batch_inputs[0] (i.e. restore the partition in the dimension that was + // combined in the forward pass) ff.create_disjoint_partition(batch_inputs[0]->num_dims, batch_inputs[0]->dims, batch_inputs[0]->parallel_is, @@ -249,25 +255,6 @@ FutureMap Combine::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - // if this is the last operator, we add the region below in order to copy the - // output to the grad tensor - assert(ff.config.computationMode == COMP_MODE_INFERENCE); - int last_op = ff.operators.size() - 1; - assert(ff.operators[last_op]->op_type == OP_ARGMAX || - ff.operators[last_op]->op_type == OP_SAMPLING); - last_op -= 1; - while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { - last_op -= 1; - } - if (ff.operators[last_op] == this) { - launcher.add_region_requirement( - RegionRequirement(inference_output_grad_lps[batch_outputs[0]], - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } @@ -310,23 +297,28 @@ FutureMap Combine::peft_bwd(FFModel const &ff, assert(numOutputs == 1); assert(numInputs == 1); assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); - DataType data_type = batch_inputs[0]->data_type; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + DataType data_type = inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_inputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(inference_output_grad_lps[batch_outputs[0]], 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); @@ -424,8 +416,8 @@ void Combine::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - // assert(regions.size() == 2); - // assert(task->regions.size() == 2); + assert(regions.size() == 2); + assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { @@ -466,15 +458,19 @@ void Combine::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - printf("BWD combine\n"); assert(regions.size() == 2); assert(task->regions.size() == 2); - CombineMeta const *m = *((CombineMeta **)task->local_args); + // CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + // TODO: figure out why m->output_type[0] or m->input_type[0] are not working + DataType data_type = *((DataType *)task->args); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - DataType data_type = output_grad.data_type; + data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == data_type); assert(output_grad.domain == input_grad.domain); if (data_type == DT_FLOAT) { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index ae3b7eaa14..066701f65c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -385,28 +385,18 @@ void InferenceManager::peft_bwd(FFModel *model, int last_op = model->operators.size() - 1; // Assert that the last operator must be argmax or sampling assert(model->operators[last_op]->op_type == OP_ARGMAX || + model->operators[last_op]->op_type == OP_ARG_TOPK || model->operators[last_op]->op_type == OP_SAMPLING); last_op -= 1; while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { last_op -= 1; } - if (model->config.tensor_parallelism_degree > 1) { - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_COMBINE); - assert(fused_op->op_op_type[fused_op->numOperators - 2] == OP_SOFTMAX); - } else { - assert(model->operators[last_op]->op_type == OP_COMBINE); - assert(model->operators[last_op - 1]->op_type == OP_SOFTMAX); - } - } else 
{ - // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || - model->operators[last_op]->op_type == OP_FUSED); - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); - } + // Assert that the previous operator must be softmax + assert(model->operators[last_op]->op_type == OP_SOFTMAX || + model->operators[last_op]->op_type == OP_FUSED); + if (model->operators[last_op]->op_type == OP_FUSED) { + FusedOp *fused_op = static_cast(model->operators[last_op]); + assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); } for (int o = last_op; o >= 0; o--) { Op *op = model->operators[o]; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index eca8c31785..6d77730e47 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3270,6 +3270,34 @@ bool FFModel::is_mlp_block(int layer_idx) const { return false; } +bool FFModel::need_to_add_combine(int layer_idx) const { + if (config.computationMode != COMP_MODE_INFERENCE || + config.tensor_parallelism_degree == 1 || layers.size() <= 2) { + return false; + } + auto const &l = layers[layer_idx]; + // softmax followed by argmax/arg_topk: add combine before softmax + if (layer_idx == layers.size() - 2) { + auto const &l_next = layers[layer_idx + 1]; + if (l->op_type == OP_SOFTMAX && + (l_next->op_type == OP_ARG_TOPK || l_next->op_type == OP_ARGMAX)) { + return true; + } else { + return false; + } + } + // argmax/arg_topk not precedent by softmax: add combine before + // argmax/arg_topk + if (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX)) { + auto const &l_prev = layers[layer_idx - 1]; + if (l_prev->op_type == OP_SOFTMAX) { + return false; + } + return true; + } + return false; +} void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3283,11 +3311,9 @@ void FFModel::create_operators_from_layers() { inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } Op *op = nullptr; - // add a combine before arg_topk / argmax - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (layer_idx == layers.size() - 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX))) { + // add a combine before last arg_max / arg_topk or before second-to-last + // softmax + if (need_to_add_combine(layer_idx)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, From 50fc13d20a8175720e031b785ec21b9a3248722d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 31 Jan 2024 23:53:23 +0000 Subject: [PATCH 139/198] fix replicate bwd --- src/parallel_ops/replicate.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index e4f19faa0a..2a3818e212 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -291,8 +291,12 @@ FutureMap Replicate::peft_bwd(FFModel const &ff, assert(numInputs == 1); assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); DataType data_type = batch_inputs[0]->data_type; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_inputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, From f2c9a052ddbf4c469f2755c224d0d2faaa1509c3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 1 Feb 2024 04:58:32 +0000 Subject: [PATCH 140/198] fix --- src/runtime/inference_manager.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 066701f65c..66c47e6559 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -391,13 +391,6 @@ void InferenceManager::peft_bwd(FFModel *model, while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { last_op -= 1; } - // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || - model->operators[last_op]->op_type == OP_FUSED); - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); - } for (int o = last_op; o >= 0; o--) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { From cd68f5d0cf6348410b718283517e2cfa947309ee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Feb 2024 16:35:54 +0000 Subject: [PATCH 141/198] let user control amount of peft memory --- include/flexflow/config.h | 1 + inference/python/incr_decoding.py | 5 ++- inference/python/spec_infer.py | 5 ++- python/flexflow/core/__init__.py | 5 ++- python/flexflow/serve/__init__.py | 32 +++++++++++++++++-- src/runtime/model.cc | 19 +++++++++-- src/runtime/model.cu | 4 +++ .../python_test_configs/generate_configs.py | 5 ++- 8 files changed, 67 insertions(+), 9 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2f112d4fc9..9bb230132a 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -154,6 +154,7 @@ class FFConfig { size_t offload_reserve_space_size; DataType quantization_type; // PEFT related fields + bool enable_peft; size_t peft_activation_reserve_space_size; size_t peft_weight_reserve_space_size; // Control parallelizable dimensions diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 4a146ab503..ed57453762 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": True, diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index c9fb5cc7bb..b31ddf4604 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 2, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": 
False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": True, diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index d7b1a595d2..2614518acf 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -87,7 +87,10 @@ "offload": "-offload", "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", - "use_8bit_quantization": "--8bit-quantization" + "use_8bit_quantization": "--8bit-quantization", + "enable_peft": "", + "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", + "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 274b431ad8..5805670ae0 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -44,6 +44,9 @@ def init( offload_reserve_space_size: Optional[int] = None, use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, + enable_peft: Optional[bool] = None, + peft_activation_reserve_space_size: Optional[int] = None, + peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, inference_debugging: Optional[bool] = None, fusion: Optional[bool] = None, @@ -68,9 +71,12 @@ def init( - tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 - pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 - offload: whether to enable offloading of the weights to CPU, defaults to False - - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False + - enable_peft: whether to enable the use of PEFT, defaults to False + - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -98,12 +104,18 @@ def init( :type pipeline_parallelism_degree: Optional[int], optional :param offload: whether to enable offloading of the weights to CPU, defaults to False :type offload: Optional[bool], optional - :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB :type offload_reserve_space_size: Optional[int], optional :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False :type use_4bit_quantization: Optional[bool], optional :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False :type use_8bit_quantization: Optional[bool], optional + :param 
enable_peft: whether to enable the use of PEFT, defaults to False + :type enable_peft: Optional[bool], optional + :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + :type peft_activation_reserve_space_size: Optional[int], optional + :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB + :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -131,6 +143,9 @@ def init( offload_reserve_space_size is not None, use_4bit_quantization is not None, use_8bit_quantization is not None, + enable_peft is not None, + peft_activation_reserve_space_size is not None, + peft_weight_reserve_space_size is not None, profiling is not None, inference_debugging is not None, fusion is not None, @@ -156,6 +171,9 @@ def init( "offload_reserve_space_size": offload_reserve_space_size, "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, + "enable_peft": enable_peft, + "peft_activation_reserve_space_size": peft_activation_reserve_space_size, + "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "inference_debugging": inference_debugging, "fusion": fusion, @@ -176,6 +194,8 @@ def init( "tensor_parallelism_degree", "pipeline_parallelism_degree", "offload_reserve_space_size", + "peft_activation_reserve_space_size", + "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -194,11 +214,17 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8*1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: configs_dict["use_8bit_quantization"] = False + if configs_dict.get("enable_peft", None) is None: + configs_dict["enable_peft"] = False + if configs_dict.get("peft_activation_reserve_space_size", None) is None: + configs_dict["peft_activation_reserve_space_size"] = 8*1024**3 + if configs_dict.get("peft_weight_reserve_space_size", None) is None: + configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("inference_debugging", None) is None: diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 6d77730e47..e73415faaf 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1524,8 +1524,9 @@ FFRuntime::FFRuntime(FFConfig &config) { info.offload_reserve_space_size = config.cpu_offload ? config.offload_reserve_space_size : 0; info.peft_activation_reserve_space_size = - config.peft_activation_reserve_space_size; - info.peft_weight_reserve_space_size = config.peft_weight_reserve_space_size; + config.enable_peft ? config.peft_activation_reserve_space_size : 0; + info.peft_weight_reserve_space_size = + config.enable_peft ? 
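The default-filling logic added to serve/__init__.py a few hunks above checks each new key one at a time; a compact, runnable Python sketch of the same pattern for the three PEFT options is shown here, with the default values copied verbatim from the patch (a compact equivalent of the per-key checks, not a copy of the actual init() code):

    def fill_peft_defaults(configs_dict):
        # Values copied from the defaults assigned in serve/__init__.py above.
        defaults = {
            "enable_peft": False,
            "peft_activation_reserve_space_size": 8 * 1024**3,
            "peft_weight_reserve_space_size": 1024**3,
        }
        for key, value in defaults.items():
            if configs_dict.get(key) is None:
                configs_dict[key] = value
        return configs_dict

    print(fill_peft_defaults({"enable_peft": True}))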
config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -4062,6 +4063,7 @@ struct DefaultConfig { const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB // PEFT related fields + const static bool enablePeft = false; const static size_t peftActivationReserveSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 1GB const static size_t peftWeightReserveSpaceSize = @@ -4102,6 +4104,7 @@ FFConfig::FFConfig() { cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; // PEFT related fields + enable_peft = DefaultConfig::enablePeft; peft_activation_reserve_space_size = DefaultConfig::peftActivationReserveSpaceSize; peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; @@ -4227,6 +4230,18 @@ void FFConfig::parse_args(char **argv, int argc) { quantization_type = DT_INT8; continue; } + if ((!strcmp(argv[i], "-enable-peft"))) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-activation-reserve-space-size")) { + peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { + peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 754a6b18d7..80f4fdf143 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -164,6 +164,8 @@ FFHandler handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); handle.peft_activation_allocator->create_legion_instance( workspaceInst, info->peft_activation_reserve_space_size); + } else { + handle.peft_activation_allocator = nullptr; } if (info->peft_weight_reserve_space_size > 0) { @@ -188,6 +190,8 @@ FFHandler void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); handle.peft_weight_allocator = new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); + } else { + handle.peft_weight_allocator = nullptr; } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index ebaadade32..b5cad16c65 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -14,9 +14,12 @@ "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8 GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": True, From 64a59d891ae3db48c8234af9bf46fadf48c4bd9b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Feb 2024 17:17:56 +0000 Subject: [PATCH 142/198] only run peft_bwd if peft is enabled --- src/runtime/request_manager.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index cbb21e03e0..2eebc070d6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2206,7 +2206,9 @@ 
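The three new command-line flags above (-enable-peft, -peft-activation-reserve-space-size, -peft-weight-reserve-space-size) take their sizes in MB and are scaled to bytes with * 1024 * 1024 inside parse_args. A small stand-alone Python sketch of that scaling, useful for sanity-checking the values used in the Python configs (1024 means 1 GiB, 8 * 1024 means 8 GiB):

    def mb_to_bytes(size_mb: int) -> int:
        # Mirrors the `atoll(argv[++i]) * 1024 * 1024` scaling in FFConfig::parse_args.
        if size_mb <= 0:
            raise ValueError("reserve space sizes must be positive integers (in MB)")
        return size_mb * 1024 * 1024

    print(mb_to_bytes(1024))      # 1073741824 bytes, i.e. 1 GiB
    print(mb_to_bytes(8 * 1024))  # 8589934592 bytes, i.e. 8 GiB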
GenerationResult RequestManager::generate_incr_decoding( BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); - im->peft_bwd(llm, 0, bcf); + if (llm->config.enable_peft) { + im->peft_bwd(llm, 0, bcf); + } assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); From 32a07165cf1a68e8b15c8f591a66c397888712ec Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 6 Feb 2024 05:46:25 +0000 Subject: [PATCH 143/198] fix rms norm inference region reqs --- src/ops/residual_rms_norm.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index e549e5f6da..264c12f004 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -394,7 +394,7 @@ FutureMap assert(batch_outputs[0]->region == batch_inputs[0]->region); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -412,7 +412,7 @@ FutureMap launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); launcher.add_field(3, FID_DATA); @@ -423,7 +423,7 @@ FutureMap regions[0](I/O): input1 / residual output regions[1](I): input2 regions[2](O): output - regions[3](I/O): weight + regions[3](I): weight */ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, From a37b173adebb0f90767a16b4421a9de6a2ba42ee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 06:37:37 +0000 Subject: [PATCH 144/198] fix in-place fusion (part 1) --- src/ops/fused.cc | 140 +++++++++++++++++++++++++++++++++++++++---- src/runtime/model.cc | 15 ++++- 2 files changed, 143 insertions(+), 12 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 8afd61aece..5f15e0b1cb 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/fused.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -87,12 +88,32 @@ FusedOp::FusedOp(FFModel &model, Op *op) // weights[i]->owner_idx = i; weight_data_types[i] = op->weights[i]->data_type; } - numOutputs = op->numOutputs; - for (int i = 0; i < numOutputs; i++) { - outputs[i] = op->outputs[i]; - outputs[i]->owner_op = this; - outputs[i]->owner_idx = i; - output_data_types[i] = op->outputs[i]->data_type; + numOutputs = 0; + for (int i = 0; i < op->numOutputs; i++) { + bool found = false; + // Handle in-place outputs + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This output is one of the inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[i] = SOURCE_INPUT; + op_input_idx[i] = j; + found = true; + break; + } + } + if (found) { + // do nothing + } else { + outputs[numOutputs] = op->outputs[i]; + output_data_types[numOutputs] = op->outputs[i]->data_type; + op_output_source[i] = SOURCE_OUTPUT; + op_output_idx[i] = numOutputs; + outputs[numOutputs]->owner_op = this; + outputs[numOutputs]->owner_idx = numOutputs; + numOutputs++; + } } numOperators = 1; op_num_inputs[0] = op->numInputs; @@ -109,10 +130,53 @@ FusedOp::FusedOp(FFModel &model, Op *op) 
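The FusedOp constructor change above is the heart of the in-place fusion fix: an output whose region aliases one of the fused operator's inputs is recorded with source SOURCE_INPUT instead of being given a fresh output slot. A minimal Python sketch of that bookkeeping, with plain integer ids standing in for LogicalRegion handles (an illustration of the idea, not the exact FlexFlow fields):

    SOURCE_INPUT, SOURCE_OUTPUT = "input", "output"

    def classify_outputs(input_regions, output_regions):
        fused_outputs = []      # regions that become the fused op's own outputs
        source, index = [], []  # per original output: where it lives, and at which slot
        for out in output_regions:
            if out in input_regions:                 # in-place output, aliases an input
                source.append(SOURCE_INPUT)
                index.append(input_regions.index(out))
            else:                                    # regular output, gets a new slot
                source.append(SOURCE_OUTPUT)
                index.append(len(fused_outputs))
                fused_outputs.append(out)
        return fused_outputs, source, index

    # e.g. a residual-style op whose first output is written in place over input region 7
    print(classify_outputs([7, 8], [7, 9]))   # ([9], ['input', 'output'], [0, 0])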
op_weight_source[i] = SOURCE_WEIGHT; op_weight_idx[i] = i; } - for (int i = 0; i < numOutputs; i++) { - op_output_source[i] = SOURCE_OUTPUT; - op_output_idx[i] = i; - } + // for (int i = 0; i < numOutputs; i++) { + // op_output_source[i] = SOURCE_OUTPUT; + // op_output_idx[i] = i; + // } +#if 0 + int input_offset = 0, weight_offset = 0, output_offset = 0; + printf("\nNew fused op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif } bool FusedOp::add_operator(FFModel &model, Op *op) { @@ -231,6 +295,18 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; op_output_idx[output_offset + i] = j; + break; + } + } + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This input is one of my inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[output_offset + i] = SOURCE_INPUT; + op_output_idx[output_offset + i] = j; + found = true; + break; } } if (found) { @@ -271,6 +347,50 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { "Reach to the #outputs limit during fusion.\n" "Consider increase MAX_NUM_OUTPUTS to allow more fusions.\n"); } + +#if 0 + printf("\nAdd op: %s (%s), #input:%i, #output:%i, #weights:%i. 
Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif + return true; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index e73415faaf..0a76f84445 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2967,8 +2967,19 @@ bool FFModel::apply_fusion(std::vector const &operators, found = k; } } - assert(found >= 0); - op->inputs[idx] = fused_op->outputs[found]; + if (found >= 0) { + op->inputs[idx] = fused_op->outputs[found]; + } else { + for (int k = 0; k < fused_op->numInputs; k++) { + if (fused_op->inputs[k]->region == + op->inputs[idx]->region) { + assert(found == -1); + found = k; + } + } + assert(found >= 0); + op->inputs[idx] = fused_op->inputs[found]; + } } } // Insert op From 85f4d400142b29db74b89da749b765117bdf1b28 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 06:51:53 +0000 Subject: [PATCH 145/198] fix inplace fusion (part 2) --- src/runtime/inference_manager.cc | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 66c47e6559..c7f2b6d5a9 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -664,9 +664,19 @@ void FFModel::compile_inference() { } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(fused->outputs[my_off]->region == - old_op->outputs[i]->region); + assert( + fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(fused->outputs[my_off]->region == + old_op->outputs[i]->region); + } else { + assert(fused->inputs[my_off]->region == + old_op->outputs[i]->region); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; From bb56a993879b7ab4edffeecae0467179fc0d5595 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 07:17:43 +0000 Subject: [PATCH 146/198] fix --- src/ops/fused.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 5f15e0b1cb..7d0d829e51 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -482,8 +482,13 @@ void 
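The apply_fusion change above handles the consumer side of the same problem: when a later operator reads a tensor that the fused operator produced in place, the matching region is found among the fused op's inputs after the search over its outputs comes up empty. A short illustrative Python sketch of that resolution order (plain ids again, not FlexFlow types):

    def resolve_region(region, fused_outputs, fused_inputs):
        # Prefer the fused op's outputs, then fall back to its inputs,
        # which is where in-place results live.
        if region in fused_outputs:
            return ("output", fused_outputs.index(region))
        if region in fused_inputs:
            return ("input", fused_inputs.index(region))
        raise AssertionError("region is neither an output nor an input of the fused op")

    print(resolve_region(9, fused_outputs=[9], fused_inputs=[7, 8]))  # ('output', 0)
    print(resolve_region(7, fused_outputs=[9], fused_inputs=[7, 8]))  # ('input', 0)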
FusedOp::init_inference(FFModel const &ff, } for (int i = 0; i < op_num_outputs[op]; i++) { int my_off = op_output_idx[i + ooff]; - assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - my_batch_outputs.push_back(batch_outputs[my_off]); + if (op_output_source[i + ooff] == SOURCE_OUTPUT) { + my_batch_outputs.push_back(batch_outputs[my_off]); + } else if (op_output_source[i + ooff] == SOURCE_INPUT) { + my_batch_outputs.push_back(batch_inputs[my_off]); + } else { + assert(false); + } } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; From 63f1fcedde381283349a201e6800f3cb6836bfc7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 21:38:17 +0000 Subject: [PATCH 147/198] disable automatic inplace rms norm for now --- include/flexflow/flexflow_c.h | 1 + include/flexflow/model.h | 1 + .../ops/kernels/residual_rms_norm_kernels.h | 1 + include/flexflow/ops/residual_rms_norm.h | 2 + .../flexflow/ops/residual_rms_norm_params.h | 1 + inference/models/llama.cc | 3 + python/flexflow/core/flexflow_cffi.py | 7 +- src/c/flexflow_c.cc | 11 +- src/ops/fused.cc | 4 - src/ops/kernels/residual_rms_norm_kernels.cu | 1 + src/ops/residual_rms_norm.cc | 205 +++++++++++++----- 11 files changed, 179 insertions(+), 58 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 01a2818a2b..6ce5876fa1 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -565,6 +565,7 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index b3a6a85808..ecad8034bc 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -617,6 +617,7 @@ class FFModel { Tensor *outputs, float eps, int dim, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 691f8ef8c1..6eb5c0ae21 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -32,6 +32,7 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; + bool inplace_residual; int in_dim; int batch_size; int num_elements; diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 2acc06841c..bf75cd573a 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -20,6 +20,7 @@ class ResidualRMSNorm : public Op { const ParallelTensor _input2, float _eps, int dim, + bool inplace_residual, bool allocate_weights, char const *name); ResidualRMSNorm(FFModel &model, @@ -96,6 +97,7 @@ class ResidualRMSNorm : public Op { float eps; int effective_batch_size; int dim, data_dim; + bool inplace_residual; }; } // namespace FlexFlow #endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index a4e4de59ab..8b8f666dc1 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; diff --git 
a/inference/models/llama.cc b/inference/models/llama.cc index 9950d5b080..f4afb32e24 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -80,6 +80,7 @@ void LLAMA::create_llama_model(FFModel &ff, token_att_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, std::string("layers_" + std::to_string(i) + "_attention_norm") .c_str()); @@ -171,6 +172,7 @@ void LLAMA::create_llama_model(FFModel &ff, token_ff_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); token = token_ff_norm[0]; @@ -234,6 +236,7 @@ void LLAMA::create_llama_model(FFModel &ff, final_rms_norm_output, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, "norm"); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index de3f7e6929..f39e8f1e7e 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3320,7 +3320,7 @@ def rms_norm(self, input, eps, dim, name=None): self.add_layer(OpType.RMS_NORM, name) return Tensor(handle, owner_op_type=OpType.RMS_NORM) - def residual_rms_norm(self, input1, input2, eps, dim, name=None): + def residual_rms_norm(self, input1, input2, eps, dim, inplace_residual=False, name=None): """Defines the Residual RMS Norm layer. :param input: the input 1 Tensor. @@ -3338,11 +3338,14 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): :param name: the name of the layer. Default is None. :type name: string + :param inplace_residual: whether to compute the residual inplace using the input tensor. Default is False. + :type inplace_residual: bool + :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) handles_array = ffc().flexflow_model_add_residual_rms_norm( - self.handle, input1.handle, input2.handle, eps, dim, c_name + self.handle, input1.handle, input2.handle, eps, dim, inplace_residual, c_name ) self.add_layer(OpType.RESIDUAL_RMS_NORM, name) return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 8f5d197eb3..a7d081bd1a 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1469,13 +1469,20 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input1 = FFCObjectWrapper::unwrap(input1_); Tensor input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor_outputs[2]; - handle->residual_rms_norm( - input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + handle->residual_rms_norm(input1, + input2, + tensor_outputs, + eps, + dim, + inplace_residual, + input1->data_type, + name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); flexflow_tensor_t *tensor_outputs_wrapped = diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 7d0d829e51..bdb6d4d7a2 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -608,10 +608,6 @@ FutureMap FusedOp::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig - // so we transfer the maximum of them - // size_t batch_config_size = - // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 664c1ed13b..969c6458a4 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -31,6 +31,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, : OpMeta(handler, rms) { eps = rms->eps; + inplace_residual = rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 264c12f004..cb511ef547 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -42,7 +42,8 @@ using namespace FlexFlow::Kernels::ResidualRMSNorm; bool operator==(ResidualRMSNormParams const &lhs, ResidualRMSNormParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps && + lhs.dim == rhs.dim && lhs.inplace_residual == rhs.inplace_residual; } bool ResidualRMSNormParams::is_valid( @@ -55,6 +56,7 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + params.inplace_residual = this->inplace_residual; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -66,6 +68,7 @@ void FFModel::residual_rms_norm(const Tensor input1, Tensor *outputs, float eps, int dim, + bool inplace_residual, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -106,6 +109,7 @@ void FFModel::residual_rms_norm(const Tensor input1, rm->add_float_property("eps", eps); rm->add_int_property("dim", dim); + rm->add_int_property("inplace_residual", inplace_residual); layers.push_back(rm); outputs[0] = rm->outputs[0]; outputs[1] = rm->outputs[1]; @@ -120,6 +124,8 @@ Op *ResidualRMSNorm::create_operator_from_layer( long long value; layer->get_int_property("dim", value); int dim = value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new ResidualRMSNorm(model, layer->layer_guid, @@ -127,6 +133,7 @@ Op *ResidualRMSNorm::create_operator_from_layer( inputs[1], eps, dim, + inplace_residual, false, layer->name); } @@ -143,6 +150,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, params.eps, params.dim, + params.inplace_residual, allocate_weights, params.name) {} @@ -157,6 +165,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, other.eps, other.dim, + other.inplace_residual, allocate_weights, other.name) {} ResidualRMSNorm::ResidualRMSNorm(FFModel &model, @@ -165,6 +174,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, const ParallelTensor _input2, float _eps, int dim, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -177,6 +187,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, _input1, _input2) { eps = _eps; + inplace_residual = _inplace_residual; inputs[0] = _input1; inputs[1] = _input2; layer_guid = _layer_guid; @@ -237,13 +248,17 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, void ResidualRMSNorm::map_output_tensors(FFModel &ff) { 
assert(numOutputs == 2); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); - outputs[0]->parallel_is = inputs[0]->parallel_is; - outputs[0]->region = inputs[0]->region; - outputs[0]->part = inputs[0]->part; - outputs[0]->region_grad = inputs[0]->region_grad; - outputs[0]->part_grad = inputs[0]->part_grad; - // map output 1 to new region - ff.map_tensor(outputs[1], this); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } } void ResidualRMSNorm::init(FFModel const &ff) { @@ -261,32 +276,44 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - assert(outputs[0]->part == inputs[0]->part); - assert(outputs[0]->region == inputs[0]->region); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -314,32 +341,45 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -390,32 +430,45 @@ FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -440,20 +493,68 @@ void ResidualRMSNorm::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // residual_output is mapped to the same region as the input - GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + + GenericTensorAccessorW residual_output, output; + GenericTensorAccessorR weight; + if (m->inplace_residual) { + // residual_output is mapped to the same region as the input + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + } else { + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); + } + inference_kernel_wrapper( m, bc, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + if (m->inplace_residual) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input1, input2}, + {weight}, + {residual_output, output}); + } } } @@ -463,6 +564,7 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { 
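Because inplace_residual changes which regions the launchers above attach (the residual output either aliases input 0 or gets its own WRITE_ONLY region), the code switches from fixed field ids to a running fid counter; the AddBiasResidualLayerNorm changes later in this series follow the same pattern. A small illustrative Python sketch of the resulting layout for ResidualRMSNorm, with the list index playing the role of fid (names and privileges simplified):

    def residual_rms_norm_regions(inplace_residual: bool):
        regions = [("input0", "READ_WRITE" if inplace_residual else "READ_ONLY"),
                   ("input1", "READ_ONLY")]
        if not inplace_residual:
            regions.append(("residual_output", "WRITE_ONLY"))  # separate region
        regions.append(("norm_output", "WRITE_ONLY"))
        regions.append(("weight", "READ_ONLY"))
        return regions

    for flag in (True, False):
        print(flag, len(residual_rms_norm_regions(flag)), residual_rms_norm_regions(flag))
    # True:  4 regions, residual output written in place over input0
    # False: 5 regions, residual output gets its own region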
sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -483,6 +585,8 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + int inplace_residual; + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -491,6 +595,7 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } From 0d3aa7ecefea1b2aa2fb6e43b9a7ccf43c3811b4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 8 Feb 2024 05:18:01 +0000 Subject: [PATCH 148/198] fix inf fusion inplace --- include/flexflow/flexflow_c.h | 2 + include/flexflow/model.h | 2 + .../ops/add_bias_residual_layer_norm.h | 3 + .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/residual_layer_norm.h | 3 + .../flexflow/ops/residual_layer_norm_params.h | 1 + inference/models/falcon.cc | 2 + inference/models/mpt.cc | 3 + inference/models/opt.cc | 3 + inference/models/starcoder.cc | 3 + python/flexflow/core/flexflow_cffi.py | 8 + src/c/flexflow_c.cc | 10 +- src/ops/add_bias_residual_layer_norm.cc | 238 ++++++++++++------ src/ops/residual_layer_norm.cc | 138 +++++++--- src/ops/residual_layer_norm.cu | 1 + src/ops/residual_rms_norm.cc | 1 + src/runtime/substitution.cc | 1 + 17 files changed, 315 insertions(+), 105 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 6ce5876fa1..cd98c7f604 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -270,6 +270,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( @@ -281,6 +282,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ecad8034bc..33dcb079b2 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -579,6 +579,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer @@ -589,6 +590,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 550d56c47c..08b7404e14 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -24,6 +24,7 @@ class AddBiasResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name); void map_output_tensors(FFModel &ff) override; @@ -138,6 +139,7 @@ class AddBiasResidualLayerNorm : public Op { bool elementwise_affine, use_bias; int64_t effective_batch_size, 
effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -152,6 +154,7 @@ class AddBiasResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; // PEFT related fields diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 87fe2fb562..840f521b01 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index d924132452..a028097905 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -26,6 +26,7 @@ class ResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool inplace_residual, bool allocate_weights, char const *name); void map_output_tensors(FFModel &ff) override; @@ -124,6 +125,7 @@ class ResidualLayerNorm : public Op { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -138,6 +140,7 @@ class ResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; // PEFT related fields diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 949ae0c799..166d4b2b4e 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid(std::tupledata_type, name); assert(tensor_outputs[0] != nullptr); @@ -679,7 +681,7 @@ flexflow_tensor_t * DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " "%p, output0: %p, " "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " - "%f, use_bias: %d, name %s", + "%f, use_bias: %d, inplace_residual: %d, name %s", input, residual1, residual2, @@ -689,6 +691,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -706,6 +709,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -722,13 +726,14 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, input->data_type, name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, 
elementwise_affine %d, eps " - "%f, use_bias %d, name %s", + "%f, use_bias %d, inplace_residual: %d, name %s", input, residual, tensor_outputs[0], @@ -736,6 +741,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 6b71279971..bdf30a803a 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -43,7 +43,8 @@ bool operator==(AddBiasResidualLayerNormParams const &lhs, AddBiasResidualLayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && - lhs.use_bias == rhs.use_bias; + lhs.use_bias == rhs.use_bias && + lhs.inplace_residual == rhs.inplace_residual; } bool AddBiasResidualLayerNormParams::is_valid( @@ -58,6 +59,7 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + params.inplace_residual = this->inplace_residual; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -71,6 +73,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -171,6 +174,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -189,6 +193,8 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( layer->get_int_vector_property("axes", axes); float eps; layer->get_float_property("eps", eps); + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new AddBiasResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -197,6 +203,7 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -215,6 +222,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -227,6 +235,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -239,7 +248,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( _input, _residual), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias) { + use_bias(_use_bias), inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -348,44 +357,57 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == 
batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } // attn output // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -393,7 +415,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -416,44 +438,56 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - assert(outputs[0]->part == inputs[0]->part); - assert(outputs[0]->region == inputs[0]->region); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } // input: attn output // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -461,7 +495,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -535,37 +569,50 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; // input // added_output: input + attn bias + residual - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // attn bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, @@ -573,7 +620,7 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -581,7 +628,7 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); @@ -590,13 +637,17 @@ FutureMap AddBiasResidualLayerNorm::inference( void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { assert(numOutputs == 2); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); - outputs[0]->parallel_is = inputs[0]->parallel_is; - outputs[0]->region = inputs[0]->region; - outputs[0]->part = inputs[0]->part; - outputs[0]->region_grad = inputs[0]->region_grad; - outputs[0]->part_grad = inputs[0]->part_grad; - // map output 1 to new region - ff.map_tensor(outputs[1], this); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } } /* @@ -625,29 +676,69 @@ void AddBiasResidualLayerNorm::inference_task( assert(regions.size() == 4 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - + int rid = 0, tid = 0, did = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR attn_bias = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); Domain attn_bias_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); Domain residual_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -672,23 +763,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[4], - task->regions[4], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); if (m->use_bias) { beta = helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[5], - task->regions[5], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, 
task->regions[5].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -1031,6 +1122,7 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1047,6 +1139,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; + bool inplace_residual; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -1061,6 +1154,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -1072,6 +1166,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -1092,6 +1187,7 @@ size_t hash::operator()( } hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index dc302ce19c..9eea01cd81 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -44,7 +44,8 @@ bool operator==(ResidualLayerNormParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && lhs.use_bias == rhs.use_bias && - lhs.use_two_residuals == rhs.use_two_residuals; + lhs.use_two_residuals == rhs.use_two_residuals && + lhs.inplace_residual == rhs.inplace_residual; } bool ResidualLayerNormParams::is_valid( @@ -63,6 +64,7 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; + params.inplace_residual = this->inplace_residual; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -78,6 +80,7 @@ void FFModel::residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -178,6 +181,7 @@ void FFModel::residual_layer_norm(const Tensor input, ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); ln->add_int_property("use_two_residuals", use_two_residuals); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -198,6 +202,9 @@ Op *ResidualLayerNorm::create_operator_from_layer( layer->get_float_property("eps", eps); layer->get_int_property("use_two_residuals", value); bool use_two_residuals = (bool)value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; + return new ResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -208,6 +215,7 @@ Op *ResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // 
allocate_weights layer->name); } @@ -229,6 +237,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -242,6 +251,7 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -255,7 +265,8 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, _residual1, _use_two_residuals ? _residual2 : nullptr), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + use_bias(_use_bias), use_two_residuals(_use_two_residuals), + inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -328,13 +339,17 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, void ResidualLayerNorm::map_output_tensors(FFModel &ff) { assert(numOutputs == 2); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); - outputs[0]->parallel_is = inputs[0]->parallel_is; - outputs[0]->region = inputs[0]->region; - outputs[0]->part = inputs[0]->part; - outputs[0]->region_grad = inputs[0]->region_grad; - outputs[0]->part_grad = inputs[0]->part_grad; - // map output 1 to new region - ff.map_tensor(outputs[1], this); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } } void ResidualLayerNorm::init_inference( @@ -358,16 +373,19 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -385,6 +403,15 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -434,11 +461,12 @@ void ResidualLayerNorm::init(FFModel const &ff) { int field_id = 0; // input // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part, @@ -456,6 +484,14 @@ void ResidualLayerNorm::init(FFModel const &ff) { inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -876,16 +912,19 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -903,6 +942,15 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -972,8 +1020,23 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -1011,8 +1074,14 @@ void ResidualLayerNorm::inference_task( assert(in_domain.get_volume() == residual2_domain.get_volume()); assert(residual2_domain == in_domain); } - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -1091,6 +1160,7 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1106,6 +1176,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; bool use_two_residuals; + bool inplace_residual; float eps; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -1122,6 +1193,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -1139,6 +1211,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + params.inplace_residual = inplace_residual; strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( @@ -1165,6 +1238,7 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); hash_combine(key, params.use_two_residuals); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cu 
b/src/ops/residual_layer_norm.cu index 0ba462cde5..5e736cd6e8 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -36,6 +36,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index cb511ef547..ff6729b925 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -840,6 +840,7 @@ size_t hash::operator()( hash_combine(key, params.eps); hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index e8b986582f..8c08c1cca0 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3814,6 +3814,7 @@ bool FFModel::convert_graph_to_operators( abr_ln->elementwise_affine, abr_ln->use_bias, abr_ln->eps, + abr_ln->inplace_residual, true, NULL); break; From b658061c9a953d09a99ac24cf479c4966dfe1eef Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 9 Feb 2024 15:25:43 -0500 Subject: [PATCH 149/198] fix rest input grads for peft without inplace residuals --- src/runtime/inference_manager.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index c7f2b6d5a9..229d1785bf 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -613,6 +613,11 @@ void FFModel::compile_inference() { // We should not reset input grads since other operators have already // saved gradients into the region op->reset_input_grads[i] = false; + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { + reset_inputs.insert(op->inputs[0]->region); + op->reset_input_grads[0] = false; + } } else { reset_inputs.insert(op->inputs[i]->region); } From 3255fe4c260d72271d1f00d2a391c48f511c75fd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 9 Feb 2024 22:00:35 +0000 Subject: [PATCH 150/198] fix --- src/ops/residual_rms_norm.cc | 4 ++-- src/runtime/inference_manager.cc | 4 +++- src/runtime/request_manager.cc | 9 +++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index ff6729b925..28fafcf224 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -482,13 +482,13 @@ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 4); - assert(regions.size() == 4); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + assert(task->regions.size() == 5 - m->inplace_residual); + assert(regions.size() == 5 - m->inplace_residual); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( diff --git 
a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 229d1785bf..15d02edbbb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -613,7 +613,9 @@ void FFModel::compile_inference() { // We should not reset input grads since other operators have already // saved gradients into the region op->reset_input_grads[i] = false; - } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || + op->op_type == OP_RESIDUAL_RMS_NORM || + op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { reset_inputs.insert(op->inputs[0]->region); op->reset_input_grads[0] = false; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2eebc070d6..20496b7d84 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2077,6 +2077,15 @@ bool is_peft_operator_type(OperatorType type) { PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, LoraLinearConfig const mlp_second) { + if (!(mlp_first == LoraLinearConfig::DefaultConfig && + mlp_second == LoraLinearConfig::DefaultConfig)) { + if (!config.enable_peft) { + fprintf(stderr, + "Error: trying to register PEFT model, but peft mode is not " + "enabled.\n"); + assert(false); + } + } PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); std::vector peft_operators; From ec2002e98a40bc7814ba38ba5dbc0ba87c9727e3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 15 Feb 2024 22:16:52 +0000 Subject: [PATCH 151/198] fix --- src/runtime/inference_manager.cc | 2 +- tests/peft/alignment/align_test_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 15d02edbbb..e480e74baa 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -617,9 +617,9 @@ void FFModel::compile_inference() { op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { - reset_inputs.insert(op->inputs[0]->region); op->reset_input_grads[0] = false; } + reset_inputs.insert(op->inputs[i]->region); } else { reset_inputs.insert(op->inputs[i]->region); } diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index b0cb5fe428..dbe7a0be40 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -1,8 +1,8 @@ import os, re, torch import numpy as np abs_dirname = os.path.dirname(os.path.abspath(__file__)) -hf_path = os.path.join(abs_dirname, "hf_peft_tensors") -ff_path = os.path.join(os.path.dirname(os.path.dirname(abs_dirname)), "build", "inference_tensors") +hf_path = os.path.join(os.path.dirname(abs_dirname), "hf_peft_tensors") +ff_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(abs_dirname))), "build", "inference_tensors") def print_unique_files_list(dirname): files_list = os.listdir(dirname) for f in sorted(files_list): From 098e88016fe8557da498ae876701f96df46ae966 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 02:48:11 +0000 Subject: [PATCH 152/198] fix residual rms --- .../ops/kernels/residual_rms_norm_kernels.h | 7 +- src/ops/fused.cu | 3 +- 
src/ops/kernels/residual_rms_norm_kernels.cu | 59 ++++---- src/ops/residual_rms_norm.cc | 138 ++++++++++++------ 4 files changed, 134 insertions(+), 73 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 6eb5c0ae21..dfc9937cc3 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -66,9 +66,10 @@ void backward_kernel_wrapper( GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, BatchConfig const *bc, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 55892ab7e9..c589f6a5be 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1026,9 +1026,10 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], my_weight_accessor[0]); break; } diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 969c6458a4..4b92e70787 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -332,6 +332,7 @@ __global__ void ComputeInternalGradientsCUDAKernel( template __global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, T const *dY, T const *X, T const *gamma, @@ -351,7 +352,7 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, if (reset_input_grad1) { dX1[index] = static_cast(dX_val); } else { - dX1[index] += static_cast(dX_val); + dX1[index] = dX1_residual[index] + static_cast(dX_val); } if (reset_input_grad2) { dX2[index] = static_cast(dX1[index]); @@ -399,6 +400,7 @@ void backward_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel<<>>( N, + nullptr, output_grad_ptr, residual_output_rms_input_ptr, weight_ptr, @@ -421,9 +423,10 @@ void backward_kernel(ResidualRMSNormMeta const *m, template void peft_bwd_kernel(ResidualRMSNormMeta const *m, BatchConfig const *bc, - T const *output_grad_ptr, - T *residual_input0_grad_ptr, - T *residual_input1_grad_ptr, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, T const *weight_ptr, cudaStream_t stream) { for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -448,7 +451,7 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, ComputeInternalGradientsCUDAKernel <<>>( N, - output_grad_ptr, + output_grad_1_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), @@ -457,13 +460,14 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel <<>>( N, - output_grad_ptr, + output_grad_0_ptr, + output_grad_1_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr, + input_grad_0_ptr, + input_grad_1_ptr, m->reset_input_grads[0], m->reset_input_grads[1]); } @@ -532,17 +536,12 @@ void 
backward_kernel_wrapper( } } -/* - regions[0](I): RMS output_grad - regions[1](I/O): Residual input 0 grad - regions[2](I/O): Residual input 1 grad - regions[3](I): weight -*/ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, BatchConfig const *bc, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, GenericTensorAccessorR const &weight) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -552,24 +551,28 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - assert(output_grad.data_type == residual_input0_grad.data_type); - assert(residual_input0_grad.data_type == residual_input1_grad.data_type); - assert(residual_input1_grad.data_type == weight.data_type); + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); - if (output_grad.data_type == DT_HALF) { + if (output_grad_1.data_type == DT_HALF) { peft_bwd_kernel(m, bc, - output_grad.get_half_ptr(), - residual_input0_grad.get_half_ptr(), - residual_input1_grad.get_half_ptr(), + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), weight.get_half_ptr(), stream); - } else if (output_grad.data_type == DT_FLOAT) { + } else if (output_grad_1.data_type == DT_FLOAT) { peft_bwd_kernel(m, bc, - output_grad.get_float_ptr(), - residual_input0_grad.get_float_ptr(), - residual_input1_grad.get_float_ptr(), + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), weight.get_float_ptr(), stream); } else { diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28fafcf224..c0e517f5c4 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -732,37 +732,47 @@ Legion::FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // regions[0](I): RMS output_grad - launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[1]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[2](I/O): residual input grad 0 - launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - // regions[3](I/O): residual input grad 1 + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); - launcher.add_field(2, FID_DATA); - // regions[4](I): gamma + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -776,45 +786,91 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 4); - assert(regions.size() == 4); ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; } - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_input0_grad = + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[1], - task->regions[1], + regions[rid++], + task->regions[t_rid++], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_input1_grad = + GenericTensorAccessorW input_grad_1 = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[2], - task->regions[2], + regions[rid++], + task->regions[t_rid++], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + peft_bwd_kernel_wrapper( - m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, - shard_id, - bc, - {residual_input0_grad, residual_input1_grad}, - {weight}, - {output_grad}, - false); + if 
(!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } } } From 5688e16b374c6cd1b95433879ec68c9b002248d7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 05:02:10 +0000 Subject: [PATCH 153/198] fix --- src/ops/fused.cc | 2 +- src/ops/fused.cu | 43 ++++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index bdb6d4d7a2..4c934f8612 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -652,7 +652,7 @@ FutureMap FusedOp::inference(FFModel const &ff, offset += numOutputs; // add softmax output grad if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { - printf("operator %i is last SOFTMAX! adding output %i\n", + printf("operator %i is last SOFTMAX! adding grad for output %i\n", numOperators - 1, numOutputs - 1); assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c589f6a5be..b89b6909cf 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -44,6 +44,7 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/ffconst_utils.h" namespace FlexFlow { // declare Legion names @@ -161,6 +162,9 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif // Domain my_id[MAX_NUM_INPUTS]; // Domain my_wd[MAX_NUM_WEIGHTS]; // Domain my_od[MAX_NUM_OUTPUTS]; @@ -172,9 +176,15 @@ __host__ void if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } @@ -191,6 +201,9 @@ __host__ void // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -439,13 +452,14 @@ __host__ void assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -668,22 +682,13 @@ __host__ void std::vector weight_accessors_to_save; 
std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } + input_accessors_to_save.push_back(my_input_accessor[i]); } for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); + weight_accessors_to_save.push_back(my_weight_accessor[i]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); + output_accessors_to_save.push_back(my_output_accessor[i]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; From 9225e0c966cc5156ee6967c25be62c59bb1c2b4b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 05:39:37 +0000 Subject: [PATCH 154/198] fix --- src/ops/fused.cu | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b89b6909cf..33b0aeca19 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -441,11 +441,12 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { @@ -805,6 +806,9 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; @@ -813,9 +817,15 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } @@ -832,6 +842,9 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { From e12bff14f266d4b6ee1d868c3e883c76b916079a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:19:11 +0000 Subject: [PATCH 155/198] enable inf debugging in fusion bwd --- src/ops/fused.cu | 23 +++++++++++++++++++++++ 1 file changed, 23 
insertions(+) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 33b0aeca19..965e08d6f9 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1195,6 +1195,29 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } } } From ed9afb7c0e1bff9f4966ff0afbe6c3b55e2e9cf5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:25:47 +0000 Subject: [PATCH 156/198] hack to silence warning in fused bwd --- src/ops/fused.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 4c934f8612..a81bf716bd 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -652,9 +652,9 @@ FutureMap FusedOp::inference(FFModel const &ff, offset += numOutputs; // add softmax output grad if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { - printf("operator %i is last SOFTMAX! adding grad for output %i\n", - numOperators - 1, - numOutputs - 1); + // printf("operator %i is last SOFTMAX! adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, @@ -700,7 +700,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[i]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); @@ -721,7 +721,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, - READ_WRITE, + i == numOutputs -1 ? READ_WRITE : WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); From 96d0e9b00fc1e33ec34e682f8b231b098f52bffc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:43:25 +0000 Subject: [PATCH 157/198] fix --- src/ops/fused.cc | 2 +- src/ops/fused.cu | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index a81bf716bd..d5f1ace86d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -721,7 +721,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, - i == numOutputs -1 ? READ_WRITE : WRITE_ONLY, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 965e08d6f9..99d9e3410f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -44,7 +45,6 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" -#include "flexflow/ffconst_utils.h" namespace FlexFlow { // declare Legion names @@ -444,9 +444,9 @@ __host__ void RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; Kernels::RMSNorm::inference_kernel_wrapper(m, bc, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { @@ -454,13 +454,14 @@ __host__ void assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::inference_kernel_wrapper(m, - bc, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -678,7 +679,11 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; @@ -1195,7 +1200,11 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; From 2cbc0b717bd5063627595059ece7c357f74cba23 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 04:31:05 +0000 Subject: [PATCH 158/198] fix --- src/runtime/model.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 603e87a937..10ce05ca1e 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3798,9 +3798,16 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + 
old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions(fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions(fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; From 36cb2b39d1ff573e2b8f60dcc81deb1b4a4378f0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 05:39:13 +0000 Subject: [PATCH 159/198] fix build --- inference/incr_decoding/incr_decoding.cc | 2 +- src/c/flexflow_c.cc | 2 +- src/ops/arg_topk.cc | 5 +-- src/ops/inc_multihead_self_attention.cu | 1 + src/ops/sigmoid_silu_multi.cc | 4 --- src/runtime/inference_manager.cc | 4 +-- src/runtime/model.cc | 14 +++++---- src/runtime/request_manager.cc | 39 ++++++++++++++---------- 8 files changed, 39 insertions(+), 32 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7f2ea21148..d376c3e39c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -270,7 +270,7 @@ void FlexFlow::top_level_task(Task const *task, : model.register_peft_model( LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); - + // Start background server rm->start_background_server(&model); diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index a9ba9158ee..58acf3d010 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1616,7 +1616,7 @@ void flexflow_model_generate(flexflow_model_t handle_, text_str.c_str(), max_seq_length); } - + std::vector results = handle->generate(requests); // If the prompt exceeds max seq len, check that we return the prompt with no diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 53332791c4..53b259a703 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 92bafaead3..83fdbaf927 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1644,6 +1644,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, // Copy C_softmax to m->softmax_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->softmax_activation_buffer = allocator->allocate_instance_untyped( sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index e87bd16699..98cd662efd 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -570,10 +570,6 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - size_t name_len; - char name[MAX_OPNAME] = {0}; - dez.deserialize(name_len); - dez.deserialize(name, name_len); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 34c807dee4..91a6dab9b5 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -236,8 +236,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check whether we need to reset input grads // We use a parallel tensor's region as the key std::set reset_inputs; - for (int l = operators.size() - 1; l >= 0; l--) { - Op *op = operators[l]; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->region != LogicalRegion::NO_REGION); if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 10ce05ca1e..a64fb8ec9c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3799,14 +3799,16 @@ bool FFModel::check_operators_integrity( for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || - (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && - (old_op->op_type == OP_RESIDUAL_LAYERNORM || - old_op->op_type == OP_RESIDUAL_RMS_NORM || - old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { - assert(FusedOp::use_same_regions(fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); } else { - assert(FusedOp::use_same_regions(fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + assert(FusedOp::use_same_regions( + fused->inputs[my_off], old_op->outputs[i], pt_mapping)); } } ioff += fused->op_num_inputs[op]; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7bc1966abf..41c371d4e2 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -435,12 +435,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } else { int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + 
if (request.tokens.size() >= + old_bc.requestsInfo[i].max_sequence_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -469,7 +470,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; @@ -486,10 +488,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; + << std::setprecision(3) << total_request_run_time + << std::endl; outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -509,11 +511,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = + processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = + new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -527,10 +532,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + get_max_tokens_per_batch() - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -538,7 +543,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; new_bc.num_tokens++; } // Update profiling @@ -2399,7 +2405,8 @@ std::vector> return merged_tree; } -std::vector FFModel::generate(std::vector const &requests) { +std::vector + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < 
requests.size(); i++) { From 21b77f11c3cacb06c294bdb17a2b3be52e8fdb83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 18:47:01 +0000 Subject: [PATCH 160/198] fix --- src/ops/noop.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ops/noop.cc b/src/ops/noop.cc index dabdf835dd..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,9 +90,10 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - // OpMeta *m = new OpMeta(handle); - return nullptr; + OpMeta *m = new OpMeta(handle, no_op); + return m; } void NoOp::init_inference(FFModel const &ff, @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, From 9075d3fb7ea3ef893c46f554551f681a109d8f90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 20:29:41 +0000 Subject: [PATCH 161/198] fix --- python/flexflow/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2614518acf..522dbe7e44 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -88,7 +88,7 @@ "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", "use_8bit_quantization": "--8bit-quantization", - "enable_peft": "", + "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } From 0b35b0c16ee2bec35be2acb4c59e9e7801292b4e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 20:19:50 +0000 Subject: [PATCH 162/198] add draft peft test --- tests/peft_test.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 tests/peft_test.sh diff --git a/tests/peft_test.sh b/tests/peft_test.sh new file mode 100755 index 0000000000..29b3e6520c --- /dev/null +++ b/tests/peft_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Token to access private huggingface models (e.g. 
LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["Two things are infinite: "]' > ../inference/prompt/peft.json + +# Create output folder +mkdir -p ../inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Download test model +python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m +# if first time, add: --refresh-cache + +./inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft From b6ada2f9b9df6ce00c0c2a2b00d6bf3ac81dea2d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 27 Mar 2024 14:47:31 -0400 Subject: [PATCH 163/198] Peft python interface (#1306) * update script * less model renaming * fix * fix * fix * backup * . * update * . * fixes * fix * fix build * fix * fix * fix issues for downloading peft model * solved issues for download peft model * added printouts for debugging * fix * fix seg fault * add test, separate peft script in cpp * fix * fixes * fix * update peft python interface * update * update * update * updates * fix * fixes * fix * fixes --------- Co-authored-by: april-yyt --- CMakeLists.txt | 1 + include/flexflow/ffconst.h | 8 +- include/flexflow/flexflow_c.h | 36 +- include/flexflow/model.h | 15 +- include/flexflow/ops/lora_linear.h | 26 +- include/flexflow/ops/lora_linear_params.h | 4 +- include/flexflow/request_manager.h | 4 +- inference/incr_decoding/incr_decoding.cc | 43 +- inference/models/falcon.cc | 34 +- inference/models/llama.cc | 110 +- inference/models/mpt.cc | 35 +- inference/models/opt.cc | 43 +- inference/models/starcoder.cc | 26 +- inference/peft/CMakeLists.txt | 38 + inference/peft/Makefile | 37 + inference/peft/peft.cc | 348 ++ inference/python/ff_peft.py | 148 + inference/utils/download_peft_model.py | 28 +- python/flexflow/core/flexflow_cffi.py | 4819 +++++++++++---------- python/flexflow/serve/__init__.py | 15 +- python/flexflow/serve/models/falcon.py | 35 +- python/flexflow/serve/models/llama.py | 43 +- python/flexflow/serve/models/mpt.py | 37 +- python/flexflow/serve/models/opt.py | 40 +- python/flexflow/serve/models/starcoder.py | 47 +- python/flexflow/serve/serve.py | 481 +- python/flexflow/type.py | 3 + src/c/flexflow_c.cc | 145 +- src/ops/fused.cu | 6 +- src/ops/inc_multihead_self_attention.cu | 3 +- src/ops/lora_linear.cc | 479 +- src/ops/lora_linear_params.cc | 28 +- src/runtime/ffconst_utils.cc | 6 +- src/runtime/file_loader.cc | 90 +- src/runtime/graph.cc | 3 +- src/runtime/inference_manager.cc | 44 + src/runtime/model.cc | 19 +- src/runtime/request_manager.cc | 215 +- tests/peft/hf_serve.py | 70 +- tests/peft_test.sh | 6 +- 40 files changed, 4228 insertions(+), 3390 deletions(-) create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100644 inference/python/ff_peft.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..22770b6c28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -558,6 +558,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) 
add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index fb12adf2d3..016dd7bdd1 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -78,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { @@ -179,8 +184,7 @@ enum OperatorType { OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // PEFT Ops - OP_LORA_MLP_FIRST, - OP_LORA_MLP_SECOND, + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index b7b20f2d2f..004523e875 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -593,6 +595,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -616,10 +621,13 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, int **output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -1036,6 +1044,26 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 34ace0c5dc..099e2209e4 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,19 +837,12 @@ class FFModel { // 
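For reference, the new C entry points declared above compose in the same way as the C++ add_lora_layer path introduced later in this series. A minimal, hypothetical sketch (not part of the patch) follows; the ffmodel handle is assumed to come from the existing flexflow_model_create binding, and the cache folder and adapter name are illustrative values taken from the test script elsewhere in this series:

    /* Hypothetical usage sketch of the new C PEFT API; `ffmodel` is assumed to be
     * a flexflow_model_t obtained through the existing model-creation bindings. */
    flexflow_lora_linear_config_t peft_config = flexflow_lora_linear_config_create(
        "~/.cache/flexflow", "goliaro/llama-160m-lora-full");
    flexflow_peft_model_id_t peft_model_id =
        flexflow_model_add_lora_layer(ffmodel, peft_config);
    /* peft_model_id can then be attached to inference or finetuning requests;
     * flexflow_lora_linear_config_destroy and flexflow_peft_model_id_destroy are
     * the matching cleanup calls. */
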
========================================
   // PEFT Layers
   // ========================================
-  void lora_linear(Tensor const input,
-                   Tensor const output,
-                   OperatorType _type,
-                   char const *name = nullptr);
+  PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
   // ========================================
   // Inference APIs
   // ========================================
   std::vector<GenerationResult>
       generate(std::vector<Request> const &requests);
-  PEFTModelID register_peft_model(
-      LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig,
-      LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig);
-
   Tensor create_tensor_legion_ordering(int num_dim,
                                        int const dims[],
                                        DataType data_type,
@@ -1174,6 +1167,12 @@ class FFModel {
   std::vector<Layer *> layers;
   std::vector<Op *> operators;
   std::vector<ParallelParameter> parameters;
+  // PEFT related
+  std::unordered_map<Layer *, Layer *> base_layer_to_peft_layer;
+  std::unordered_map<Layer *, std::vector<PEFTModelID>> peft_layer_to_peft_id;
+  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
+  // std::vector<Op *> peft_operators;
+
   FFHandler handlers[MAX_NUM_WORKERS];
   Legion::Future current_metrics;
   // Cached operators: key: operator hash, value: operator pointer
diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h
index b9aabdd1aa..9e83c3f90e 100644
--- a/include/flexflow/ops/lora_linear.h
+++ b/include/flexflow/ops/lora_linear.h
@@ -17,12 +17,14 @@ class LoraLinear : public Op {
   using Params = LoraLinearParams;
   using Input = std::pair<ParallelTensor, ParallelTensor>;
 
-  LoraLinear(FFModel &model,
-             LayerID const &layer_guid,
-             OperatorType type,
-             ParallelTensor const input,
-             ParallelTensor const output,
-             char const *name = nullptr);
+  LoraLinear(
+      FFModel &model,
+      LayerID const &layer_guid,
+      OperatorType type,
+      ParallelTensor const input,
+      ParallelTensor const output,
+      std::unordered_map<PEFTModelID, LoraLinearConfig> const &_peft_configs,
+      char const *name = nullptr);
   LoraLinear(FFModel &model,
              LoraLinear const &other,
              ParallelTensor const input,
@@ -39,11 +41,6 @@ class LoraLinear : public Op {
              MachineView const *mv = nullptr) override;
   void forward(FFModel const &) override;
   void backward(FFModel const &) override;
-  void register_peft_model(FFModel const &ff,
-                           std::vector<ParallelTensor> const &batch_inputs,
-                           std::vector<ParallelTensor> const &batch_outputs,
-                           PEFTModelID const &model_id,
-                           LoraLinearConfig const lora_config);
   Legion::FutureMap inference(FFModel const &,
                               BatchConfigFuture const &,
                               std::vector<ParallelTensor> const &,
@@ -64,11 +61,6 @@ class LoraLinear : public Op {
                         std::vector<Legion::PhysicalRegion> const &regions,
                         Legion::Context ctx,
                         Legion::Runtime *runtime);
-  static void
-      register_model_task(Legion::Task const *task,
-                          std::vector<Legion::PhysicalRegion> const &regions,
-                          Legion::Context ctx,
-                          Legion::Runtime *runtime);
   static void inference_task(Legion::Task const *task,
                              std::vector<Legion::PhysicalRegion> const &regions,
                              Legion::Context ctx,
@@ -98,6 +90,8 @@ class LoraLinear : public Op {
                      int num_inputs) const override;
   // size_t get_params_hash() const override;
   LoraLinearParams get_params() const;
+
+  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h
index e82243fd67..ff041334f1 100644
--- a/include/flexflow/ops/lora_linear_params.h
+++ b/include/flexflow/ops/lora_linear_params.h
@@ -12,7 +12,7 @@ namespace FlexFlow {
 
 class LoraLinearConfig {
 public:
-  static const LoraLinearConfig DefaultConfig;
+  static const LoraLinearConfig EmptyConfig;
   LoraLinearConfig();
   LoraLinearConfig(int rank,
                    OptimizerType type = OPTIMIZER_TYPE_SGD,
@@ -33,6 +33,7 @@ class LoraLinearConfig {
   std::string peft_model_id;
   int lora_alpha;
   float lora_dropout;
+  std::vector<std::string>
target_modules; // whether to load weights from file, instead of initializing them randomly bool load_weights_from_file; }; @@ -41,6 +42,7 @@ class LoraLinearParams { public: LayerID layer_guid; OperatorType type; + std::unordered_map peft_configs; char name[MAX_OPNAME]; bool is_valid(std::pair const diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0e59888888..bf6e475cbb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -65,7 +65,6 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified }; - enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_sequence_length = 128; @@ -81,10 +80,11 @@ struct Request { RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; - std::vector> dataset_text; + std::string dataset_filepath; std::vector, std::vector>> dataset; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d376c3e39c..c3993b1ad4 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -40,7 +40,6 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, - std::string &peft_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -58,13 +57,6 @@ void parse_input_args(char **argv, } continue; } - if (!strcmp(argv[i], "-peft-model")) { - peft_model_name = std::string(argv[++i]); - for (char &c : peft_model_name) { - c = std::tolower(c); - } - continue; - } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = std::string(argv[++i]); @@ -133,7 +125,7 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - std::string llm_model_name, peft_model_name; + std::string llm_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -150,7 +142,6 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, - peft_model_name, use_full_precision, verbose, do_sample, @@ -159,6 +150,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -259,19 +251,6 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Register PEFT layer - LoraLinearConfig mlp_second = - peft_model_name.empty() - ? LoraLinearConfig::DefaultConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = - peft_model_name.empty() - ? 
PEFTModelID::NO_ID - : model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, - mlp_second /*mlp_second*/); - - // Start background server rm->start_background_server(&model); int total_num_requests = 0; @@ -288,20 +267,10 @@ void FlexFlow::top_level_task(Task const *task, for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; } std::vector result = model.generate(requests); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index f86130ff2b..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -91,7 +91,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -117,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -142,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -188,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -204,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -254,26 +254,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 0db7796567..4be232e81b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -86,7 +86,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -112,7 +112,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -135,7 +135,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -158,7 +158,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -178,60 +178,54 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." 
+ std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( + w2 = ff.dense( multi, - w2, - OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2_lora") - .c_str()); + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -254,7 +248,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -288,16 +282,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 95179691a1..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." 
+ std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -88,7 +88,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -114,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -138,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -162,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -184,7 +184,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -200,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -213,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -228,7 +228,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, @@ -262,21 +262,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 7d2abad829..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -96,7 +96,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." 
+ std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -122,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -146,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -170,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -189,8 +189,8 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -207,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -218,13 +218,10 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( - fc1, - fc2, - OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); + // ff.lora_linear(std::string("fc2"), std::string("layers." + + // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -252,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -276,24 +273,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index fb6269ad75..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -98,7 +98,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -125,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -147,7 +147,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -163,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -178,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -192,7 +192,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, @@ -235,16 +235,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..4547907176 --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) +set(project_target peft) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + 
target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..eade2eaeeb --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,348 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + 
file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? 
LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + int total_num_requests = 0; + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + // for (auto &prompt : prompt_json) { + // std::string text = prompt.get(); + // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + total_num_requests++; + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..38a25fb614 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,148 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 1, + "memory_per_gpu": 8192, + "zero_copy_memory_per_node": 12000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": True, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "peft_model_ids": [ + "goliaro/llama-160m-lora-full", + ], + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json" + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128 + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + for peft_model_id in configs.peft_model_ids: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(peft_model_id), + dataset_filepath=configs.finetuning_dataset, + ) + requests.append(finetuning_request) + + llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 5c7704b6f0..ad79816f84 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -6,7 +6,10 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" + "--base_model_name", type=str, help="Name of the model to download" + ) + parser.add_argument( + "peft_model_ids", type=str, nargs="+", help="Name of the PEFT model(s) to download" ) parser.add_argument( "--cache-folder", @@ -42,16 +45,19 @@ def main(args): else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for peft_model_id in args.peft_model_ids: - for data_type in data_types: - peft = ff.PEFT( - peft_model_id, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - peft.download_hf_weights_if_needed() - peft.download_hf_config() + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + llm.add_peft(peft_model_id) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() if __name__ == "__main__": diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b92a0a92af..82c3eb059c 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,7 @@ CompMode, MetricsType, InferenceMode, + RequestType, ModelType, OpType, ParameterSyncType, @@ -36,6 +37,7 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library +from typing import Union, List def ffc(): @@ -1243,615 +1245,646 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # 
----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. - :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. 
- """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. + super(NormInitializer, self).__init__(self.norm_handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - :param x: the first input Tensor. - :type x: Tensor + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. 
- :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param x: the first input Tensor. - :type x: Tensor +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :param y: the second input Tensor. - :type y: Tensor + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - :param name: the name of the layer. Default is None. - :type name: string + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param x: the first input Tensor. - :type x: Tensor + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :param y: the second input Tensor. - :type y: Tensor + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - :param input: the input Tensor. - :type input: Tensor +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :param axes: the axes along which reduction is applied - :type axes: List[int] + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - :param name: the name of the layer. Default is None. - :type name: string + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param input: the input Tensor. - :type input: Tensor + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. 
+ """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - :param name: the name of the layer. Default is None. - :type name: string +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow( - self.handle, input.handle, exponent, c_name - ) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " PEFTModelID: + if peft_model_id not in self.pefts: + raise ValueError( + f"PEFT {peft_model_id} not registered with LLM {self.model_name}" + ) + peft_dict = self.pefts[peft_model_id] + if "ff_peft_model_id" not in peft_dict: + raise RuntimeError( + f"Attempting to run PEFT {peft_model_id} before compiling LLM {self.model_name}" + ) + return peft_dict["ff_peft_model_id"] + def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), + "configs", + peft_model_id.lower(), + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) - def __get_revision_hashes(self, model_name: str, weights: bool): + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -180,64 +196,107 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. + + If any PEFT adapter is registered, perform the same operation for PEFT. 
""" - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) + def get_weights_path(model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), + ) - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model - print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." - ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + + def download_llm_weights(): + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, self.weights_path + ) + if ff_revision != latest_revision: + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." 
+ ) + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_type = peft_dict["peft_type"] + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = ( + self.__get_revision_hashes(peft_model_id, weights_path) + ) + + if ff_revision != latest_revision: + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + self.weights_path = get_weights_path(self.model_name) + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -253,7 +312,7 @@ def download_hf_tokenizer_if_needed(self): ) if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." ) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -263,20 +322,13 @@ def download_hf_tokenizer_if_needed(self): # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
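The renaming that convert_peft_model() applies to Hugging Face PEFT parameter names can be shown in isolation. convert_hf_weight_name() is model-specific, so it is left out of this sketch:

    # Hugging Face PEFT parameter name -> file name written into the weights folder.
    hf_name = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight"
    ff_name = hf_name.replace("base_model.model.model.", "").replace(".default", "")
    # convert_hf_weight_name(ff_name) is applied next by the model class (omitted here).
    print(ff_name)  # layers.0.self_attn.q_proj.lora_A.weight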
+ ) # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -284,19 +336,13 @@ def download_hf_tokenizer_if_needed(self): ) else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), @@ -374,6 +420,15 @@ def compile( max_tokens_per_batch, ) + # Add PEFT layer if registered + for peft_model_id, peft_dict in self.pefts.items(): + # ff_peft_config = peft_dict["ff_peft_config"] + ff_peft_config = LoraLinearConfig( + os.path.expanduser(self.cache_path), peft_model_id + ) + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() @@ -420,22 +475,36 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, or list of strings - :type prompts: Union[str, List[str]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + print(requests_or_prompts) + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" @@ -447,17 +516,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -533,152 +591,3 @@ def 
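With the new dispatch in generate(), all of the following call forms are accepted; the prompts are placeholders and `llm` is a compiled, running LLM as in the example script:

    llm.generate("What is the capital of France?")                  # single prompt string
    llm.generate(["first prompt", "second prompt"], max_length=64)  # list of prompt strings
    llm.generate(
        ff.Request(
            ff.RequestType.REQ_INFERENCE,
            prompt="What is machine learning?",
            max_sequence_length=128,
        )
    )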
compile( model_specific_pipeline_parallelism_degree, ssms, ) - - -class PEFT: - """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" - - def __init__( - self, - peft_model_id: str, - data_type: DataType = DataType.DT_HALF, - cache_path: str = "", - refresh_cache: bool = False, - ): - self.hf_config = PeftConfig.from_pretrained(peft_model_id) - self.peft_model_id = peft_model_id - self.peft_type = self.hf_config.peft_type - if self.peft_type != "LORA": - raise RuntimeError( - f"PEFT type {self.peft_type} not yet supported in FlexFlow" - ) - self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT - self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" - self.refresh_cache = refresh_cache - # Base model related - if "base_model_name_or_path" not in self.hf_config.to_dict(): - raise ValueError( - f"PEFT model {peft_model_id} does not have an associated based model" - ) - self.base_model = LLM( - self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache - ) - - def download_hf_config(self): - """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() - ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") - with open(self.config_path, "w") as json_file: - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, peft_model_id: str): - ff_revision = None - ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) - - if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): - # Local model - files = os.listdir(peft_model_id) - state = files + [ - os.path.getmtime(os.path.join(peft_model_id, f)) for f in files - ] - latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() - else: - # Remote HuggingFace model - hf_api = HfApi() - latest_revision = hf_api.model_info(self.peft_model_id).sha - return ff_revision, ff_revision_file, latest_revision - - def convert_peft_model(self, hf_peft_model, weights_path): - for name, params in hf_peft_model.named_parameters(): - if self.peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.base_model.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. 
- """ - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.peft_model_id.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - # Local model - print( - f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." - ) - # Download base model from HuggingFace, or load it from the local folder - self.base_model.download_hf_weights_if_needed() - self.base_model.download_hf_tokenizer_if_needed() - self.base_model.download_hf_config() - hf_base_model = AutoModelForCausalLM.from_pretrained( - self.hf_config.base_model_name_or_path, - return_dict=True, - trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, - # device_map="auto", - ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) - # Print log message to notify user download of model has finished - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.convert_peft_model(hf_peft_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_peft_model - del hf_base_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.peft_model_id}' model weights from the cache...") diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..ac6975b4fd 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -152,6 +152,9 @@ class OpType(Enum): RESIDUAL_RMS_NORM = 2305 RESIDUAL_LAYERNORM = 2306 +class RequestType(Enum): + REQ_INFERENCE = 4001 + REQ_FINETUNING = 4002 def enum_to_int(enum, enum_item): for item in enum: diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 58acf3d010..cb8433c2c6 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,8 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -1542,6 +1544,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1597,43 +1614,74 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { void flexflow_model_generate(flexflow_model_t handle_, int num_requests, + enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, - int max_seq_length, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, int **output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::vector requests; + + int finetuning_req_idx = 0; for (int i = 0; i < num_requests; i++) { - std::string const text_str(input_texts[i]); - Request inference_req; - inference_req.prompt = text_str; - inference_req.max_sequence_length = max_seq_length; - requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", - i, - handle, - text_str.c_str(), - max_seq_length); + if (request_types[i] == RequestType::REQ_INFERENCE) { + std::string const text_str(input_texts[i]); + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + inference_req.peft_model_id = *peft_model_id; + } + 
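The new RequestType enum mirrors the C-side request types and is converted with the existing enum_to_int helper when crossing the C API; a minimal check:

    from flexflow.type import RequestType, enum_to_int

    assert enum_to_int(RequestType, RequestType.REQ_INFERENCE) == 4001
    assert enum_to_int(RequestType, RequestType.REQ_FINETUNING) == 4002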
requests.push_back(inference_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_lengths[i]); + } else { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + fine_tuning_req.peft_model_id = *peft_model_id; + } + std::string const dataset_fp(dataset_filepaths[finetuning_req_idx]); + fine_tuning_req.dataset_filepath = dataset_fp; + fine_tuning_req.max_training_steps = training_steps[finetuning_req_idx]; + requests.push_back(fine_tuning_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", + i, + handle, + dataset_fp.c_str(), + max_seq_lengths[i], + training_steps[finetuning_req_idx]); + finetuning_req_idx++; + } } std::vector results = handle->generate(requests); - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. for (int i = 0; i < num_requests; i++) { - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); - output_length_and_tokens[i][0] = results[i].output_tokens.size(); - std::copy(results[i].output_tokens.begin(), - results[i].output_tokens.end(), - output_length_and_tokens[i] + 1); - std::memcpy(output_texts[i], - results[i].output_text.c_str(), - results[i].output_text.length()); + if (request_types[i] == RequestType::REQ_INFERENCE) { + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } } - // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2739,3 +2787,50 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, peft_model_id); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 574fbcb573..aca93a973d 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -266,8 +266,7 @@ __host__ void batch_size); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; @@ -910,8 +909,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, num_peft_tokens); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_grad_accessor[0].domain; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 83fdbaf927..8b0776fde4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1488,7 +1488,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, 
assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 366eca27b7..170e087226 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,51 +38,132 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; -void FFModel::lora_linear(Tensor const input, - Tensor const output, - OperatorType op_type, - char const *name) { - assert(input->data_type == output->data_type); - Layer *lora = nullptr; - lora = new Layer(this, - op_type, - output->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { + return true; } - lora->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, lora, 0, true /*create_grad*/); } - layers.push_back(lora); + return false; +} + +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + return nullptr; + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; + + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ? 
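A Python restatement of the name test in check_lora_layer_match() (the C++ version additionally requires the candidate to be an OP_LINEAR layer with a non-empty name); the layer names below are hypothetical:

    def matches(layer_name: str, target_module: str) -> bool:
        # Target module name must occur in the layer name, and the layer must not
        # already be a LoRA layer.
        return target_module in layer_name and "lora" not in layer_name

    assert matches("layers.11.mlp.down_proj", "down_proj")           # gets a ".lora" sibling
    assert not matches("layers.11.mlp.down_proj.lora", "down_proj")  # already a LoRA layer
    assert not matches("layers.11.self_attn.q_proj", "down_proj")    # different module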
std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } + } + } + + return peft_model_id; } Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { + std::unordered_map _peft_configs; + std::vector const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); + } return new LoraLinear(model, layer->layer_guid, layer->op_type, inputs[0], inputs[1], + _peft_configs, layer->name); + ; } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, ParallelTensor const output) - : LoraLinear( - model, other.layer_guid, other.op_type, input, output, other.name) {} + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, @@ -93,14 +174,17 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, + params.peft_configs, params.name) {} -LoraLinear::LoraLinear(FFModel &model, - LayerID const &_layer_guid, - OperatorType _op_type, - ParallelTensor const _input, - ParallelTensor const _output, - char const *name) +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) : Op(model, _op_type, _output->data_type, @@ -129,6 +213,9 @@ LoraLinear::LoraLinear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); + } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -183,6 +270,32 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } +template +void load_peft_from_file( + DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t target_data_size = sizeof(DT) * size; + in.seekg(sharded * shard_id * target_data_size, in.beg); + in.read((char *)host_array.data(), target_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != target_data_size) { + printf("load weight data error: %lu, %lu, %lu\n", + in_get_size, + target_data_size, + sizeof(DT)); + assert(false); + } + assert(size == host_array.size()); + copy_tensor_host_to_dev(ptr, host_array.data(), size); + in.close(); +} + /* regions[0](O): output regions[1](I): kernel @@ -219,97 +332,12 @@ OpMeta *LoraLinear::init_task(Task const *task, std::strcpy(m->op_name, lora->name); m->layer_guid = lora->layer_guid; - return m; -} - -struct LoraLinearRegisterInfo { - LoraLinear const *lora; - PEFTModelID model_id; - LoraLinearConfig lora_config; -}; - -void LoraLinear::register_peft_model( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - PEFTModelID const &model_id, - LoraLinearConfig const lora_config) { - assert(check_output_input_weight_same_parallel_is()); - assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 1); - // Assert that the output and the second input are mapped to the same - // region/part - assert(batch_outputs[0]->region == batch_inputs[1]->region); - assert(batch_outputs[0]->part == batch_inputs[1]->part); - // assert(check_output_input_weight_same_machine_view()); - // output is considered as an input to allow in-place optimization - ParallelTensor output_tensor = batch_outputs[0]; - parallel_is = output_tensor->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = &output_tensor->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_inference(ff, argmap, output_tensor); - LoraLinearRegisterInfo info; - info.lora = this; - info.model_id = model_id; - info.lora_config = lora_config; - IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, - parallel_is, - TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); -} - -template -void load_peft_from_file( - DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open file: %s\n", filepath.c_str()); - } - assert(in.good() && "incorrect weight file path"); - std::vector
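The read pattern in load_peft_from_file() above, restated with NumPy for clarity: when sharded is true, shard i skips i chunks of `size` elements, otherwise reading starts at offset 0. The file path and dtype below are placeholders.

    import numpy as np

    def load_peft_weight(filepath, size, sharded, shard_id, dtype=np.float16):
        itemsize = np.dtype(dtype).itemsize
        offset = (shard_id if sharded else 0) * size * itemsize
        with open(filepath, "rb") as f:
            f.seek(offset)
            data = np.frombuffer(f.read(size * itemsize), dtype=dtype)
        # Mirror of the gcount() check: fail loudly on a short read.
        assert data.size == size, "weight file smaller than expected"
        return data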
host_array(size); - size_t target_data_size = sizeof(DT) * size; - in.seekg(sharded * shard_id * target_data_size, in.beg); - in.read((char *)host_array.data(), target_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != target_data_size) { - printf("load weight data error: %lu, %lu, %lu\n", - in_get_size, - target_data_size, - sizeof(DT)); - assert(false); - } - assert(size == host_array.size()); - copy_tensor_host_to_dev(ptr, host_array.data(), size); - in.close(); -} - -void LoraLinear::register_model_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LoraLinearRegisterInfo const *info = - static_cast(task->args); - LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); - LoraLinear const *lora = info->lora; - int shard_id = task->index_point.point_data[0]; - - int rank = info->lora_config.rank; int num_dims = lora->inputs[0]->num_dims; - int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; - int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + assert(in_dim == + lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); @@ -317,17 +345,6 @@ void LoraLinear::register_model_task(Task const *task, assert(dt == lora->inputs[0]->data_type); assert(dt == lora->inputs[1]->data_type); assert(dt == lora->outputs[0]->data_type); - assert(m->model_weights.find(info->model_id) == m->model_weights.end()); - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); // get layer name assert(lora->name != nullptr && @@ -344,61 +361,87 @@ void LoraLinear::register_model_task(Task const *task, std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - // load weights from file - std::string weights_folder_filepath = join_path({ - info->lora_config.cache_folder, - "weights", - info->lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); - std::string w1_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else { - assert(false && "Data type not supported"); - } + for (auto const &kv : lora->peft_configs) { + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + w1_num_elements, + false, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + } else { + assert(false && "Data type not supported"); + } - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + assert(m->model_weights.find(model_id) == m->model_weights.end()); + m->model_weights[model_id] = weight; } - m->model_weights[info->model_id] = weight; + + return m; } void LoraLinear::forward(FFModel const &ff) { @@ -761,7 +804,17 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type; + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { + auto it = 
rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { + return false; + } + } + return true; + } + return false; } void LoraLinear::serialize(Legion::Serializer &sez) const { @@ -769,6 +822,19 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + sez.serialize(kv.first.id); + // Serialize LoraConfig's cache folder + sez.serialize(kv.second.cache_folder.length()); + sez.serialize(kv.second.cache_folder.c_str(), + kv.second.cache_folder.length()); + // Serialize LoraConfig's peft model id + sez.serialize(kv.second.peft_model_id.length()); + sez.serialize(kv.second.peft_model_id.c_str(), + kv.second.peft_model_id.length()); + } sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -782,17 +848,45 @@ Node LoraLinear::deserialize(FFModel &ff, assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; + size_t num_pefts; size_t name_len; char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i = 0; i < num_pefts; i++) { + // Deserialize PEFTModelID + size_t pid; + dez.deserialize(pid); + PEFTModelID peft_model_id(pid); + + // Deserialize LoraConfig's cache folder + size_t string_size; + char buffer[4096] = {0}; + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + std::string cache_folder = std::string(buffer); + + // Deserialize LoraConfig's peft model id + string_size = 0; + memset(buffer, 0, 4096); + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + std::string peft_model_name = std::string(buffer); + + LoraLinearConfig lora_linear_config(cache_folder, peft_model_name); + params.peft_configs.emplace( + std::make_pair(peft_model_id, lora_linear_config)); + } dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - LoraLinearParams params; params.layer_guid = layer_guid; params.type = op_type; strcpy(params.name, name); @@ -813,6 +907,7 @@ LoraLinearParams LoraLinear::get_params() const { if (this->name != nullptr) { strcpy(params.name, this->name); } + params.peft_configs = this->peft_configs; return params; } @@ -831,6 +926,18 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.optimizer_type); + hash_combine(key, kv.second.learning_rate); + hash_combine(key, kv.second.cache_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.load_weights_from_file); + } return key; } }; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 9d797aaed2..1b142d5577 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -5,7 +5,7 @@ using json = 
nlohmann::json; namespace FlexFlow { -const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f), @@ -31,6 +31,9 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, rank = model_config["r"]; lora_alpha = model_config["lora_alpha"]; lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } } catch (json::exception const &e) { std::cerr << "Error parsing PEFT config from JSON file: " << e.what() << std::endl; @@ -48,14 +51,25 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && - lhs.learning_rate == rhs.learning_rate) { + lhs.learning_rate == rhs.learning_rate && + lhs.cache_folder == rhs.cache_folder && + lhs.peft_model_id == rhs.peft_model_id && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && + lhs.load_weights_from_file == rhs.load_weights_from_file) { + for (int i = 0; i < lhs.target_modules.size(); i++) { + if (lhs.target_modules[i] != rhs.target_modules[i]) { + return false; + } + } return true; } return false; } std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { - os << "LoraLinearConfig: "; + os << "LoraLinearConfig: {"; os << "rank: " << llc.rank << ", "; os << "optimizer_type: " << llc.optimizer_type << ", "; os << "learning_rate: " << llc.learning_rate << ", "; @@ -63,6 +77,14 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "peft_model_id: " << llc.peft_model_id << ", "; os << "lora_alpha: " << llc.lora_alpha << ", "; os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "target_modules: ["; + for (int i = 0; i < llc.target_modules.size(); i++) { + os << llc.target_modules[i]; + if (i < llc.target_modules.size() - 1) { + os << ", "; + } + } + os << "], "; os << "load_weights_from_file: " << llc.load_weights_from_file << std::endl; return os; } diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 3ee1ee62df..33e11bf451 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -189,10 +189,8 @@ std::string get_operator_type_name(OperatorType type) { case OP_ARGMAX: return "ArgMax"; // PEFT Ops - case OP_LORA_MLP_FIRST: - return "Lora MLP First Layer"; - case OP_LORA_MLP_SECOND: - return "Lora MLP Second Layer"; + case OP_LORA: + return "Lora Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index fa19c9b22d..84554c2bd4 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file 
= layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,12 +217,10 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -407,12 +405,10 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +686,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -728,44 +724,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? 
".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); load_from_file(data, volume, weight_filepath); @@ -774,7 +760,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -801,7 +787,7 @@ void FileDataLoader::load_weights(FFModel *ff) { continue; } // TODO: currently skip Lora layers - if (l->op_type == OP_LORA_MLP_FIRST || l->op_type == OP_LORA_MLP_SECOND) { + if (l->op_type == OP_LORA) { continue; } switch (weight->data_type) { diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 31cf3bb6a7..dae0021bb6 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2764,8 +2764,7 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); break; } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 91a6dab9b5..212d0ebf6b 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,31 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." << std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -609,17 +630,26 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." 
+ << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." + << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -670,6 +700,14 @@ void FFModel::compile_inference() { } } } + + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -709,6 +747,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -734,6 +774,8 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." + << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) @@ -770,6 +812,8 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index a64fb8ec9c..63016d0c8b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3308,8 +3308,7 @@ Op *FFModel::create_operator_from_layer( return op; } // PEFT layers - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; @@ -6697,22 +6696,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - { - TaskVariantRegistrar registrar(LORA_LINEAR_REG_TASK_ID, - "LoraLinear Model Registration"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "LoraLinear Model Registration Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } { TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, "LoraLinear Inference"); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 41c371d4e2..9dc0361316 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); @@ -45,6 +47,48 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: 
["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -240,19 +284,32 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; + request.initial_len = 0; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; - request.req_type = Request::REQ_FINETUNING; + request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = 1; // TODO: let user set this - for (auto const &sample : request_.dataset_text) { + request.max_training_steps = request_.max_training_steps; + request.dataset_filepath = request_.dataset_filepath; + + // Load dataset + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); std::vector input_tokens; - input_tokens = this->tokenizer_->Encode(sample.first); + input_tokens = this->tokenizer_->Encode(text); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } - std::vector output_tokens = - this->tokenizer_->Encode(sample.second); + std::vector output_tokens = this->tokenizer_->Encode(output_text); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { std::cout << "Warning: too many tokens in sample, only load up to " @@ -373,7 +430,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // No new tokens generated when in fine-tuning mode continue; } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < @@ -403,7 +460,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // fine-tuning requests don't automatically carry over to the next // batch, we only do so if there is space left after adding new // inference requests @@ 
-412,6 +469,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", old_bc.requestsInfo[i].request_guid, request.completed_training_steps); @@ -562,7 +620,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); - assert(new_request.req_type == Request::REQ_INFERENCE); + assert(new_request.req_type == RequestType::REQ_INFERENCE); pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -604,9 +662,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Step 4: add PEFT bwd requests, if there is additional space while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { @@ -615,11 +673,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } if (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); assert(request.dataset.size() > 0); // update status and training steps Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); request.completed_training_steps = all_req_handle.completed_training_steps; request.status = all_req_handle.status; assert(request.status != Request::COMPLETED); @@ -2410,7 +2468,12 @@ std::vector RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < requests.size(); i++) { - RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + } else { + guid = rm->register_new_peft_request(requests.at(i)); + } if (guid != RequestManager::INVALID_GUID) { guids.push_back(guid); } @@ -2450,6 +2513,18 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2466,6 +2541,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + 
print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2473,6 +2553,10 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + + // Print at the end of the task + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); } std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { @@ -2488,106 +2572,25 @@ std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { bool is_peft_operator_type(OperatorType type) { switch (type) { - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: + case OP_LORA: return true; default: return false; } } -PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, - LoraLinearConfig const mlp_second) { - if (!(mlp_first == LoraLinearConfig::DefaultConfig && - mlp_second == LoraLinearConfig::DefaultConfig)) { - if (!config.enable_peft) { - fprintf(stderr, - "Error: trying to register PEFT model, but peft mode is not " - "enabled.\n"); - assert(false); - } - } - PEFTModelID peft_model_id(peft_model_global_guid++); - InferenceManager *im = InferenceManager::get_inference_manager(); - std::vector peft_operators; - for (size_t op = 0; op < operators.size(); op++) { - if (is_peft_operator_type(operators[op]->op_type)) { - peft_operators.push_back(operators[op]); - } else if (operators[op]->op_type == OP_FUSED) { - FusedOp *fused = static_cast(operators[op]); - for (size_t op2 = 0; op2 < fused->numOperators; op2++) { - if (is_peft_operator_type(fused->operators[op2]->op_type)) { - peft_operators.push_back(fused->operators[op2]); - } - } - } - } - for (size_t op = 0; op < peft_operators.size(); op++) { - std::string layer_name = - find_layer_name_from_guid(this, peft_operators[op]->layer_guid); - switch (peft_operators[op]->op_type) { - case OP_LORA_MLP_FIRST: { - if (mlp_first == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_first); - break; - } - case OP_LORA_MLP_SECOND: { - if (mlp_second == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() 
== 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_second); - break; - } - default: { - assert(false && "Unsupported PEFT Operator type"); - } - } - } - return peft_model_id; -} - /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists + if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 1fde4d5a50..7bfc560cc2 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,6 +1,6 @@ import argparse import torch -import os, sys, shutil +import os, sys, shutil, json from peft import PeftModel, PeftConfig from transformers import ( AutoModelForCausalLM, @@ -40,11 +40,12 @@ def peft_post_forward_hook(module, input, output): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") + parser.add_argument("--peft-model-id", type=str, required=True) parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) - parser.add_argument("--max-new-tokens", type=int, default=50) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument( "--save-peft-tensors", @@ -52,24 +53,28 @@ def main(): help="Save PEFT hidden states and weights to file", ) args = parser.parse_args() - peft_model_id = args.peft_model_id - use_full_precision = args.use_full_precision - max_new_tokens = args.max_new_tokens - save_peft_tensors = args.save_peft_tensors - # Change working dir to folder storing this script - abspath = os.path.abspath(__file__) - dname = os.path.dirname(abspath) - os.chdir(dname) + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return - config = PeftConfig.from_pretrained(peft_model_id) + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model model = AutoModelForCausalLM.from_pretrained( config.base_model_name_or_path, return_dict=True, # load_in_8bit=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, device_map="auto", ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer hf_config = AutoConfig.from_pretrained( config.base_model_name_or_path, trust_remote_code=True ) @@ -78,25 +83,26 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained( config.base_model_name_or_path, use_fast=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) else: tokenizer = AutoTokenizer.from_pretrained( config.base_model_name_or_path, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if 
args.use_full_precision else torch.float16, ) + # Generation config generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) generation_config.do_sample = args.do_sample - # Load the Lora model - model = PeftModel.from_pretrained(model, peft_model_id) - - print(model) # Register hooks to save tensors, if needed - if save_peft_tensors: + if args.save_peft_tensors: + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir shutil.rmtree("./hf_peft_tensors") - # Check that the output folder exists os.makedirs("./hf_peft_tensors", exist_ok=True) # Save weights for name, params in model.named_parameters(): @@ -112,12 +118,22 @@ def main(): layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) - batch = tokenizer("Two things are infinite: ", return_tensors="pt") - with torch.cuda.amp.autocast(): - output_tokens = model.generate( - **batch, max_new_tokens=max_new_tokens, generation_config=generation_config - ) - print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) if __name__ == "__main__": diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 29b3e6520c..9b4a5204ac 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -25,4 +25,8 @@ export LEGION_BACKTRACE=1 python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache -./inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft +# CPP test +../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft + +# Python test +python ../inference/python/ff_peft.py From 0ed889af28ce05ae2862b1d905085744492911cc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 7 Apr 2024 20:48:20 -0700 Subject: [PATCH 164/198] fix --- include/flexflow/fftype.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 099b58c82e..3e482b8d67 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -4,6 +4,7 @@ #include "flexflow/ffconst.h" #include #include +#include namespace FlexFlow { From 48c431a393beec8902f59e5839379e4e6d6b8999 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 11 Apr 2024 14:20:31 -0700 Subject: [PATCH 165/198] update --- include/flexflow/request_manager.h | 1 + inference/peft/peft.cc | 66 ++++++++++----- src/runtime/file_loader.cc | 5 +- 
src/runtime/request_manager.cc | 132 ++++++++++++++++++++--------- src/runtime/request_manager.cu | 15 +++- 5 files changed, 153 insertions(+), 66 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index cbd0b3ad05..f3538c1c68 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -80,6 +80,7 @@ struct Request { RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; + int benchmarking_tokens = -1; std::string dataset_filepath; std::vector, std::vector>> diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index eade2eaeeb..a6fd3b99b0 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -49,7 +49,8 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length) { + int &max_sequence_length, + int &max_requests_to_run) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -118,6 +119,10 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -148,6 +153,7 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 8; int max_tokens_per_batch = 128; int max_sequence_length = 256; + int max_requests_to_run = 1000000000; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -165,7 +171,8 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length); + max_sequence_length, + max_requests_to_run); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -301,27 +308,42 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - // for (auto &prompt : prompt_json) { - // std::string text = prompt.get(); - // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // } + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %d", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; + } - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; - fine_tuning_req.max_training_steps = 1; - requests.push_back(fine_tuning_req); - total_num_requests++; + // // Add fine-tuning request + // Request fine_tuning_req; + // fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + // fine_tuning_req.max_sequence_length = 128; + // fine_tuning_req.peft_model_id = + // (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; + // fine_tuning_req.max_training_steps = 1; + // requests.push_back(fine_tuning_req); + // total_num_requests++; std::vector result = model.generate(requests); } diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index ed88dc0a99..fd31f21b26 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -759,7 +759,8 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, ? ".attn_bias" : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); + std::string weight_filepath = + join_path({weights_folder, weight_filename}); load_from_file(data, volume, weight_filepath); } else { // default op @@ -769,7 +770,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } - } + } // Copy the weight data from the buffer to the weight's ParallelTensor ParallelTensor weight_pt; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index ef7068e330..5ec230298a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -239,17 +239,26 @@ RequestManager::RequestGuid if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } - std::vector tokens = this->tokenizer_->Encode(request_.prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - return INVALID_GUID; - } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens < get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number + } else { + std::vector tokens = this->tokenizer_->Encode(request_.prompt); + if (tokens.size() >= get_max_sequence_length()) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + return INVALID_GUID; + } + for (int i = 0; i < tokens.size(); i++) { + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + } + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); + request.initial_len = request.tokens.size(); if (get_num_ssms() 
== 0) { @@ -558,20 +567,27 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; + // outputFile << "token IDs: "; + // for (int i = 0; i < request.tokens.size(); i++) { + // outputFile << request.tokens[i]; + // if (i < request.tokens.size() - 1) { + // outputFile << ","; + // } + // } + // outputFile << std::endl; + // outputFile << output; + // outputFile << std::endl; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -603,8 +619,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + for (int ii = i + 1; i < BatchConfig::max_requests_per_batch(); + ii++) { + if (!old_bc.request_completed[ii] && + !old_bc.requestsInfo[ii].prompt_phase) { + space_for_incr_dec_requests++; + } + } new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - get_max_tokens_per_batch() - new_bc.num_tokens, + get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; @@ -733,7 +759,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } } - + // pid_t pid = getpid(); + // std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + // std::ofstream filen(filenamen); + // if (filen.is_open()) { + // filen << new_bc << std::endl; + // filen.close(); + // std::cout << "String written to file: " << filenamen << std::endl; + // } else { + // std::cout << "Unable to open file: " << filenamen << std::endl; + // } + // std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + // std::ofstream fileo(filenameo); + // if (fileo.is_open()) { + // fileo << old_bc << std::endl; + // fileo.close(); + // std::cout << "String written to file: " << filenameo << std::endl; + // } else { + // std::cout << "Unable to open file: " << filenameo << std::endl; + // } return new_bc; } @@ -905,21 +949,27 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: 
"; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; - + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; + // outputFile << "token IDs: "; + // for (int i = 0; i < request.tokens.size(); i++) { + // outputFile << request.tokens[i]; + // if (i < request.tokens.size() - 1) { + // outputFile << ","; + // } + // } + // outputFile << std::endl; + // outputFile << output; + // outputFile << std::endl; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be73..235d435580 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -40,8 +40,21 @@ void RequestManager::load_tokens_task( printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + + // pid_t pid = getpid(); + // std::string filename = "bc_" + std::to_string(pid) + ".txt"; + // std::ofstream file(filename); + // if (file.is_open()) { + // file << *batch_config << std::endl; + // file.close(); + // std::cout << "String written to file: " << filename << std::endl; + // } else { + // std::cout << "Unable to open file: " << filename << std::endl; + // } + } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { + BatchConfig::max_verify_tokens_per_batch() && + batch_config->get_mode() != INC_DECODING_MODE) { printf("Warning: Speculative decoding. 
too many tokens in prompt, only " "load up to %d tokens\n", BatchConfig::max_verify_tokens_per_batch()); From 40649ee25a2ea36e25b55f37319777c95158af6d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 11 Apr 2024 17:05:16 -0700 Subject: [PATCH 166/198] fix --- inference/utils/download_peft_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index ad79816f84..596612d8d7 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import flexflow.serve as ff -import argparse +import argparse, os def parse_args(): @@ -15,7 +15,7 @@ def parse_args(): "--cache-folder", type=str, help="Folder to use to store the model(s) assets in FlexFlow format", - default="", + default=os.environ.get("FF_CACHE_PATH", ""), ) parser.add_argument( "--refresh-cache", From 0580d7e6b1ce34048e3e6fbb9572ebc0461c7d14 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 13 Apr 2024 11:52:17 -0700 Subject: [PATCH 167/198] fix to support prompts larger than max tokens per batch --- include/flexflow/batch_config.h | 4 +- include/flexflow/request_manager.h | 2 + src/runtime/batch_config.cc | 6 ++ src/runtime/request_manager.cc | 99 ++++++++++++++++++++---------- 4 files changed, 79 insertions(+), 32 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 3aebfe908d..28fca9067a 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -77,6 +77,8 @@ class BatchConfig { num_tokens_in_batch = 0; max_sequence_length = 0; request_guid = 0; + prompt_phase = false; + batch_config_request_id = -1; peft_model_id = PEFTModelID::NO_ID; peft_bwd = false; } @@ -86,7 +88,7 @@ class BatchConfig { int max_sequence_length; // request id in batch config: - int batch_config_request_id; + int batch_config_request_id = -1; bool prompt_phase = false; RequestGuid request_guid; // PEFT fields diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f3538c1c68..a7e67487bb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -167,6 +167,8 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool check_inf_req_completion(BatchConfig const &old_bc, int i); + void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 588ed61802..027ca7f5c0 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -120,6 +120,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << std::endl; os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of generation tokens: " << bc.num_generation_tokens + << std::endl; // Per-request info os << "Per-request info:\n"; @@ -133,6 +135,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + os << " Prompt phase: " 
<< bc.requestsInfo[i].prompt_phase + << std::endl; + os << " BatchConfig Req ID: " + << bc.requestsInfo[i].batch_config_request_id << std::endl; // PEFT values os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5ec230298a..7eb9be598f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -444,6 +444,62 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, + int i) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + return request_completed; +} + +void RequestManager::check_batch(BatchConfig const &old_bc, + BatchConfig const &new_bc) { + int num_incomplete_prompts = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + continue; + } + // ensure there is no request with zero tokens + assert(new_bc.requestsInfo[i].num_tokens_in_batch > 0); + // ensure there is no more than one incomplete prompt + if (new_bc.requestsInfo[i].prompt_phase && + new_bc.requestsInfo[i].num_tokens_in_batch + + new_bc.requestsInfo[i].first_token_depth_in_request < + all_requests[new_bc.requestsInfo[i].request_guid].tokens.size()) { + num_incomplete_prompts++; + } + } + if (num_incomplete_prompts > 1) { + std::cout << "Error: more than one incomplete prompt in the batch\n"; + pid_t pid = getpid(); + std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + std::ofstream filen(filenamen); + if (filen.is_open()) { + filen << new_bc << std::endl; + filen.close(); + std::cout << "String written to file: " << filenamen << std::endl; + } else { + std::cout << "Unable to open file: " << filenamen << std::endl; + } + std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + std::ofstream fileo(filenameo); + if (fileo.is_open()) { + fileo << old_bc << std::endl; + fileo.close(); + std::cout << "String written to file: " << filenameo << std::endl; + } else { + std::cout << "Unable to open file: " << filenameo << std::endl; + } + assert(false); + } +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -518,15 +574,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= - old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } + bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -621,10 +669,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Prompt phase 
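         // Invariant assumed by the code below: a request reaches this branch
         // only while its prompt is still being prefilled (see the assert on
         // prompt_phase), and check_batch() later verifies that at most one
         // such incomplete prompt exists per batch. The prompt's token budget
         // is the per-batch token cap minus tokens already scheduled in this
         // batch, minus one slot reserved for every decoding-phase request at
         // a higher batch index, so those requests are not starved by a long
         // prompt.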
assert(old_bc.requestsInfo[i].prompt_phase == true); int space_for_incr_dec_requests = 0; - for (int ii = i + 1; i < BatchConfig::max_requests_per_batch(); + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. + for (int ii = i + 1; ii < BatchConfig::max_requests_per_batch(); ii++) { - if (!old_bc.request_completed[ii] && - !old_bc.requestsInfo[ii].prompt_phase) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { space_for_incr_dec_requests++; } } @@ -759,25 +815,6 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } } - // pid_t pid = getpid(); - // std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; - // std::ofstream filen(filenamen); - // if (filen.is_open()) { - // filen << new_bc << std::endl; - // filen.close(); - // std::cout << "String written to file: " << filenamen << std::endl; - // } else { - // std::cout << "Unable to open file: " << filenamen << std::endl; - // } - // std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; - // std::ofstream fileo(filenameo); - // if (fileo.is_open()) { - // fileo << old_bc << std::endl; - // fileo.close(); - // std::cout << "String written to file: " << filenameo << std::endl; - // } else { - // std::cout << "Unable to open file: " << filenameo << std::endl; - // } return new_bc; } From 0affe2748d13fa4109e814af725d15fb551f9bee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 13 Apr 2024 21:09:27 -0700 Subject: [PATCH 168/198] fixes to support benchmarking of finetuning throughput --- include/flexflow/batch_config.h | 2 +- include/flexflow/request_manager.h | 10 + inference/peft/peft.cc | 43 ++- src/runtime/request_manager.cc | 522 ++++++++++++++++------------- 4 files changed, 333 insertions(+), 244 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 28fca9067a..ade519cd38 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -59,7 +59,7 @@ class BatchConfig { // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a7e67487bb..524d4828ec 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -78,7 +78,9 @@ struct Request { std::vector beam_trees; // PEFT field RequestType req_type = REQ_INFERENCE; + size_t processed_finetuning_tokens = 0; int completed_training_steps = 0; + int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; std::string dataset_filepath; @@ -132,6 +134,9 @@ class RequestManager { void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_enable_peft_finetuning(bool enable_peft_finetuning_); + void set_disable_peft_bwd(bool disable_peft_bwd_); + static void set_inference_finished(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -278,6 +283,11 @@ class RequestManager { int 
max_sequence_length; Status request_manager_status; + // peft benchmarking + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + static bool inference_finished; + // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index a6fd3b99b0..5c96709be7 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -50,7 +50,9 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run) { + int &max_requests_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -123,6 +125,14 @@ void parse_input_args(char **argv, max_requests_to_run = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -154,6 +164,8 @@ void FlexFlow::top_level_task(Task const *task, int max_tokens_per_batch = 128; int max_sequence_length = 256; int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -172,7 +184,9 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run); + max_requests_to_run, + enable_peft_finetuning, + disable_peft_bwd); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -242,12 +256,16 @@ void FlexFlow::top_level_task(Task const *task, GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); - rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { @@ -334,16 +352,17 @@ void FlexFlow::top_level_task(Task const *task, total_num_requests++; } - // // Add fine-tuning request - // Request fine_tuning_req; - // fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - // fine_tuning_req.max_sequence_length = 128; - // fine_tuning_req.peft_model_id = - // (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; - // fine_tuning_req.max_training_steps = 1; - // requests.push_back(fine_tuning_req); - // total_num_requests++; + fine_tuning_req.max_training_steps = 1000000000; + requests.push_back(fine_tuning_req); + total_num_requests++; std::vector result = model.generate(requests); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7eb9be598f..b8ca019d3f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -89,6 +89,8 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { return os; } +bool RequestManager::inference_finished = false; + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -160,6 +162,18 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) { spec_infer_tree_width.emplace_back(tree_width); } +void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { + enable_peft_finetuning = enable_peft_finetuning_; +} + +void RequestManager::set_disable_peft_bwd(bool disable_peft_bwd_) { + disable_peft_bwd = disable_peft_bwd_; +} + +void RequestManager::set_inference_finished() { + inference_finished = true; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -315,31 +329,40 @@ RequestManager::RequestGuid request.dataset_filepath = request_.dataset_filepath; // Load dataset - using json = nlohmann::json; - std::ifstream file_handle(request.dataset_filepath); - assert(file_handle.good() && "Dataset file does not exist."); - json dataset_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - - for (auto &prompt : dataset_json) { - std::string text = prompt.get(); - std::string output_text(""); - std::vector input_tokens; - input_tokens = this->tokenizer_->Encode(text); - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - input_tokens.insert(input_tokens.begin(), bos_token_id); - } - std::vector output_tokens = this->tokenizer_->Encode(output_text); - if (input_tokens.size() + output_tokens.size() > - get_max_sequence_length()) { - std::cout << "Warning: too many tokens in sample, only load up to " - << get_max_sequence_length() << " tokens, but got " - << input_tokens.size() + output_tokens.size() << ".\n"; - return INVALID_GUID; - } else { - request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens == get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number + } else { + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); + std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(text); + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.insert(input_tokens.begin(), bos_token_id); + } + std::vector output_tokens = + this->tokenizer_->Encode(output_text); + if 
(input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << input_tokens.size() + output_tokens.size() << ".\n"; + return INVALID_GUID; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } } } @@ -504,15 +527,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_active_infr_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - if (request.req_type == RequestType::REQ_FINETUNING) { - // No new tokens generated when in fine-tuning mode - continue; - } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < - request.tokens.size()) { + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; } else { @@ -525,192 +546,146 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; int num_active_req = -1; + // when finetuning is enabled, the last entry in the batch cannot be used for + // inference + int inference_batch_size = + BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + // Step 2: prepare the next batch for existing requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + for (int i = 0; i < inference_batch_size; i++) { if (old_bc.request_completed[i]) { // no need to carry over tokens to new batch for this request continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - - if (request.req_type == RequestType::REQ_FINETUNING) { - // fine-tuning requests don't automatically carry over to the next - // batch, we only do so if there is space left after adding new - // inference requests - request.completed_training_steps += 1; - assert(request.completed_training_steps <= request.max_training_steps); - if (request.completed_training_steps == request.max_training_steps) { - // check if the fine tuning request has completed - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", - old_bc.requestsInfo[i].request_guid, - request.completed_training_steps); + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + + int processed_tokens = + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; + assert(processed_tokens < request.tokens.size()); + bool request_completed = check_inf_req_completion(old_bc, i); + if (request_completed) { + std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } + { + // update generation result GenerationResult &gr = 
request_generation_results[request.guid]; assert(gr.guid == request.guid); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = - Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - request.completed_training_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + gr.output_tokens = request.tokens; + gr.output_text = output; } - } else { - int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - assert(processed_tokens < request.tokens.size()); - bool request_completed = check_inf_req_completion(old_bc, i); - if (request_completed) { - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - { - // update generation result - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); - log_req_mgr.print("Final output: %s", output.c_str()); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = - Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "[Profile] guid(" << request.guid - << ") llm_decoding_steps(" - << profile_info.llm_decoding_steps << ") latency(" - << std::fixed << std::setprecision(3) - << (profile_info.finish_time - profile_info.start_time) - << ")\n"; - // outputFile << "end-to-end latency: " << std::fixed - // << std::setprecision(3) << total_request_run_time - // << std::endl; - // outputFile << "num decoding steps: " - // << profile_info.llm_decoding_steps << std::endl; - // outputFile << "token IDs: "; - // for (int i = 0; i < request.tokens.size(); i++) { - // outputFile << request.tokens[i]; - // if (i < request.tokens.size() - 1) { - // outputFile << ","; - // } - // } - // outputFile << std::endl; - // outputFile << output; - // outputFile << std::endl; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); - } + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); + log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", + 
old_bc.requestsInfo[i].request_guid, + request.tokens.size()); + log_req_mgr.print("Final output: %s", output.c_str()); + num_processed_requests++; + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); } - + } + } else { + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].request_guid = + old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + num_generation_tokens++; + new_bc.requestsInfo[i].prompt_phase = false; } else { - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = - processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = - new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = - old_bc.requestsInfo[i].peft_model_id; - new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - num_generation_tokens++; - new_bc.requestsInfo[i].prompt_phase = false; - } else { - // Prompt phase - assert(old_bc.requestsInfo[i].prompt_phase == true); - int space_for_incr_dec_requests = 0; - // If the prompt can't fit in the batch, compute how much space we - // need to leave out for incomplete requests in decoding phase at - // higher indices. 
- for (int ii = i + 1; ii < BatchConfig::max_requests_per_batch(); - ii++) { - if (old_bc.request_completed[ii]) { - continue; - } - Request &old_request = - all_requests[old_bc.requestsInfo[ii].request_guid]; - bool req_completed = check_inf_req_completion(old_bc, ii); - if (!req_completed) { - space_for_incr_dec_requests++; - } + // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. + for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; } - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - get_max_tokens_per_batch() - new_bc.num_tokens - - space_for_incr_dec_requests, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); - new_bc.requestsInfo[i].prompt_phase = true; - } - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[depth]; - new_bc.num_tokens++; } - // Update profiling - profiling_requests[new_bc.requestsInfo[i].request_guid] - .llm_decoding_steps++; + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].prompt_phase = true; + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.num_tokens++; } + // Update profiling + profiling_requests[new_bc.requestsInfo[i].request_guid] + .llm_decoding_steps++; } } } new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { @@ -754,65 +729,143 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + if (enable_peft_finetuning && + !old_bc.request_completed[inference_batch_size]) { + assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0); + Request &request = + all_requests[old_bc.requestsInfo[inference_batch_size].request_guid]; + assert(request.req_type == RequestType::REQ_FINETUNING && + "Found misplaced inference request"); + + request.dataset_entry_processed_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.processed_finetuning_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + int 
dataset_entry = + request.completed_training_steps % request.dataset.size(); + if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch == + request.dataset[dataset_entry].first.size()) { + // completed the current dataset entry + assert(request.dataset_entry_processed_tokens == + request.dataset[dataset_entry].first.size()); + request.completed_training_steps += 1; + request.dataset_entry_processed_tokens = 0; + } + + assert(request.completed_training_steps <= request.max_training_steps); + if (request.completed_training_steps == request.max_training_steps || + inference_finished) { + // check if the fine tuning request has completed + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + num_processed_requests++; + + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Finetuning] guid(%zu) completed_training_steps(%d) " + "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.guid, + request.completed_training_steps, + request.processed_finetuning_tokens, + profile_info.finish_time - profile_info.start_time); + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "[Finetuning] guid(" << request.guid + << ") completed_training_steps(" + << request.completed_training_steps + << ") processed_finetuning_tokens(" + << request.processed_finetuning_tokens << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + } + } + // Step 4: add PEFT bwd requests, if there is additional space while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = RequestType::REQ_FINETUNING); + // assert(request.req_type = RequestType::REQ_FINETUNING); Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + // assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { break; } } - if (pending_peft_request_queue.size() > 0) { + + if (pending_peft_request_queue.size() > 0 && !inference_finished) { Request &request = pending_peft_request_queue.front(); assert(request.req_type = RequestType::REQ_FINETUNING); assert(request.dataset.size() > 0); // update status and training steps Request &all_req_handle = all_requests[request.guid]; assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + request.completed_training_steps = all_req_handle.completed_training_steps; + request.processed_finetuning_tokens = + all_req_handle.processed_finetuning_tokens; request.status = all_req_handle.status; + int dataset_entry = + request.completed_training_steps % request.dataset.size(); + request.dataset_entry_processed_tokens = + all_req_handle.dataset_entry_processed_tokens; + assert(request.status != Request::COMPLETED); 
assert(request.max_training_steps > 0 && request.completed_training_steps < request.max_training_steps); - int num_peft_tokens = request.dataset[0].first.size(); - int num_peft_label_tokens = request.dataset[0].second.size(); - if (num_peft_tokens + new_bc.num_active_tokens() <= - get_max_tokens_per_batch()) { - // The last request slot is reserved for PEFT request - int peft_req_idx = get_max_requests_per_batch() - 1; - assert(new_bc.request_completed[peft_req_idx]); - new_bc.request_completed[peft_req_idx] = false; - new_bc.requestsInfo[peft_req_idx].first_token_depth_in_request = 0; - new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = - new_bc.num_tokens; - new_bc.requestsInfo[peft_req_idx].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[peft_req_idx].max_sequence_length = + assert(request.dataset_entry_processed_tokens <= + request.dataset[dataset_entry].first.size()); + + int num_peft_tokens = + min((int)request.dataset[dataset_entry].first.size() - + request.dataset_entry_processed_tokens, + get_max_tokens_per_batch() - new_bc.num_active_infr_tokens()); + int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); + assert(num_peft_label_tokens == 0); + + if (num_peft_tokens > 0) { + assert(new_bc.request_completed[inference_batch_size]); + // request info + new_bc.request_completed[inference_batch_size] = false; + new_bc.requestsInfo[inference_batch_size].first_token_depth_in_request = + request.dataset_entry_processed_tokens; + new_bc.requestsInfo[inference_batch_size].first_token_offset_in_batch = + new_bc.num_active_infr_tokens(); + new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = + num_peft_tokens; + new_bc.requestsInfo[inference_batch_size].max_sequence_length = request.max_sequence_length; - new_bc.requestsInfo[peft_req_idx].request_guid = request.guid; - new_bc.requestsInfo[peft_req_idx].peft_model_id = request.peft_model_id; - new_bc.requestsInfo[peft_req_idx].peft_bwd = true; - for (size_t i = 0; i < request.dataset[0].first.size(); i++) { + new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + // tokens info + for (size_t i = request.dataset_entry_processed_tokens; + i < request.dataset_entry_processed_tokens + num_peft_tokens; + i++) { new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.dataset[0].first[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; + request.dataset[dataset_entry].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = + inference_batch_size; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; new_bc.num_tokens++; new_bc.num_peft_tokens++; } - for (size_t i = 0; i < request.dataset[0].second.size(); i++) { - new_bc.labelsInfo[new_bc.num_peft_label_tokens].token_id = - request.dataset[0].second[i]; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = - peft_req_idx; - int depth = request.dataset[0].first.size() + i; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = - depth; - new_bc.num_peft_label_tokens++; - } } } return new_bc; @@ -2568,21 +2621,28 @@ std::vector> std::vector FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); - std::vector guids; + std::vector inf_guids, peft_guids; for (int i = 0; i < requests.size(); i++) { RequestManager::RequestGuid guid; if (requests.at(i).req_type == 
RequestType::REQ_INFERENCE) { guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + inf_guids.push_back(guid); + } } else { guid = rm->register_new_peft_request(requests.at(i)); - } - if (guid != RequestManager::INVALID_GUID) { - guids.push_back(guid); + if (guid != RequestManager::INVALID_GUID) { + peft_guids.push_back(guid); + } } } std::vector results; - for (int i = 0; i < guids.size(); i++) { - results.push_back(rm->get_generation_result(guids[i])); + for (int i = 0; i < inf_guids.size(); i++) { + results.push_back(rm->get_generation_result(inf_guids[i])); + } + rm->set_inference_finished(); + for (int i = 0; i < peft_guids.size(); i++) { + results.push_back(rm->get_generation_result(peft_guids[i])); } return results; } @@ -2740,7 +2800,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); - if (llm->config.enable_peft) { + if (llm->config.enable_peft && !disable_peft_bwd) { im->peft_bwd(llm, 0, bcf); } assert(fm.get_future_map_domain().get_volume() == 1); From d7ebeaf689f0c8d105aebd3984fcdd3f1e144690 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 14 Apr 2024 19:44:45 -0700 Subject: [PATCH 169/198] many upgrades and updates related to finetuning --- .../ops/add_bias_residual_layer_norm.h | 1 + .../ops/inc_multihead_self_attention.h | 1 + include/flexflow/ops/kernels/linear_kernels.h | 1 + .../ops/kernels/lora_linear_kernels.h | 1 + .../ops/kernels/residual_rms_norm_kernels.h | 1 + .../flexflow/ops/kernels/rms_norm_kernels.h | 1 + include/flexflow/ops/layer_norm.h | 1 + include/flexflow/ops/residual_layer_norm.h | 1 + include/flexflow/ops/sigmoid_silu_multi.h | 1 + src/ops/add_bias_residual_layer_norm.cu | 13 +++++++--- src/ops/inc_multihead_self_attention.cu | 25 ++++++++++++++----- src/ops/kernels/linear_kernels.cu | 14 ++++++++--- src/ops/kernels/lora_linear_kernels.cu | 24 ++++++++++++++---- src/ops/kernels/residual_rms_norm_kernels.cu | 13 +++++++--- src/ops/kernels/rms_norm_kernels.cu | 13 +++++++--- src/ops/layer_norm.cu | 13 +++++++--- src/ops/residual_layer_norm.cu | 13 +++++++--- src/ops/sigmoid_silu_multi.cpp | 1 + src/ops/sigmoid_silu_multi.cu | 12 ++++++--- src/runtime/request_manager.cc | 23 +++++++++++------ 20 files changed, 134 insertions(+), 39 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 08b7404e14..9510ac0f28 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -159,6 +159,7 @@ class AddBiasResidualLayerNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 69f2b8bb6a..f77df7c456 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -222,6 +222,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // PEFT specific fields void *softmax_activation_buffer; void *query_activation_buffer; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/linear_kernels.h 
b/include/flexflow/ops/kernels/linear_kernels.h index bcce9a947a..90e50a0c9a 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -37,6 +37,7 @@ class LinearMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *output_activation_buffer; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 739b94ed22..32608abce2 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -23,6 +23,7 @@ class LoraLinearMeta : public OpMeta { void *low_rank_activation; void *input_activation; std::unordered_map model_weights; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index dfc9937cc3..fd4e0ecf1d 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -39,6 +39,7 @@ class ResidualRMSNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 46297764ec..475b6d94ed 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -38,6 +38,7 @@ class RMSNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 17aa4dd504..b5e9538ea6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -151,6 +151,7 @@ class LayerNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index a028097905..33a8e8be51 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -145,6 +145,7 @@ class ResidualLayerNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 28e3bfed3e..ac60ff15dd 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -110,6 +110,7 @@ class SigmoidSiluMultiMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 505806a2b9..bcca1ba2c6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -45,6 +45,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = 
gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -221,12 +222,18 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 83712232bd..d1b93cb206 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1495,12 +1495,18 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->query_activation_buffer = allocator->allocate_instance_untyped( - sizeof(DT) * total_tokens * m->num_q_heads * m->qProjSize); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } int parallelism = m->hidden_size * num_tokens; store_query_cache<<requestsInfo[i].peft_bwd) { DT *C_softmax = static_cast
(m->qk_prods_softmax); - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->softmax_activation_buffer = allocator->allocate_instance_untyped( - sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, C_softmax, sizeof(DT) * total_tokens * num_new_tokens * @@ -2131,6 +2142,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index a3f5c797de..b2e0d3dbad 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -63,6 +63,8 @@ LinearMeta::LinearMeta(FFHandler handler, // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; } LinearMeta::~LinearMeta(void) { @@ -237,11 +239,17 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->output_activation_buffer = allocator->allocate_instance_untyped( - data_type_size(m->output_type[0]) * num_peft_tokens * out_dim); + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy output activation if (m->output_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 9cd5d2ecfa..55751d96ba 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -21,7 +21,10 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) - : OpMeta(handler, li) {} + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} LoraLinearMeta::~LoraLinearMeta(void) {} @@ -180,6 +183,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); @@ -188,11 +192,21 @@ void inference_kernel(LoraLinearMeta *m, int rank = weight.rank; void *intermediate_result_ptr = nullptr; if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + 
size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); - m->low_rank_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[1]) * num_peft_tokens * rank); + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } // copy input activation checkCUDA(cudaMemcpyAsync(m->input_activation, input_ptr + first_token_offset * in_dim, diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 4b92e70787..0d44f0260a 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -45,6 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -269,12 +270,18 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index b11e954622..dd6ada864d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -44,6 +44,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -224,12 +225,18 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator 
*allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } if (input.data_type == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index bfbb2faae9..0801d11617 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -50,6 +50,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } LayerNormMeta::~LayerNormMeta(void) { @@ -254,12 +255,18 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 5e736cd6e8..8cdf87a92c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -46,6 +46,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -277,12 +278,18 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + 
m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 0a9a814f5e..bbf27db745 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -26,6 +26,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; + allocated_peft_buffer_size = 0; } SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index e3b6f7a69a..929d557a17 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -129,13 +129,19 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; size_t input_tensor_size = data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; - m->input_activation = - allocator->allocate_instance_untyped(2 * input_tensor_size); + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync(m->input_activation, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b8ca019d3f..26922e2e95 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -315,6 +315,7 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_peft_request(Request const &request_) { + assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; @@ -330,11 +331,18 @@ RequestManager::RequestGuid // Load dataset if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens == get_max_sequence_length()); + assert(request_.benchmarking_tokens <= get_max_sequence_length()); request.benchmarking_tokens = request_.benchmarking_tokens; - request.tokens.insert(request.tokens.end(), - request_.benchmarking_tokens, - 15); // insert random number + std::vector input_tokens; + std::vector output_tokens; + bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + if (bos_added) { + input_tokens.push_back(bos_token_id); + } + input_tokens.insert(input_tokens.end(), + request_.benchmarking_tokens - (int)bos_added, + 15); // insert random number + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); } else { using json = nlohmann::json; std::ifstream file_handle(request.dataset_filepath); @@ -527,12 +535,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_active_infr_tokens(); i++) { + for (int i = 0; i < 
old_bc.num_active_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - assert(request.req_type == RequestType::REQ_INFERENCE && - "Found misplaced finetuning request"); + if (request.req_type == RequestType::REQ_FINETUNING) { + continue; + } if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; From 33e873daa0de872eddf5cd31df7a45ee0a3c408a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 Apr 2024 08:48:30 -0700 Subject: [PATCH 170/198] add ttft statistics --- include/flexflow/request_manager.h | 2 ++ src/runtime/request_manager.cc | 46 +++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 524d4828ec..ddf798d456 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -328,6 +328,8 @@ class RequestManager { int llm_decoding_steps; int ssm_decoding_steps; double start_time, finish_time; + double registration_time, first_token_time; + bool first_token_time_set = false; }; std::unordered_map profiling_requests; double total_request_run_time; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 26922e2e95..1d1d98fce9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -310,6 +310,11 @@ RequestManager::RequestGuid gr.output_text = request_.prompt; gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -415,6 +420,11 @@ RequestManager::RequestGuid // gr.output_text = prompt; // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -546,9 +556,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // This is a prompt token continue; } else { + // This is a decoding token assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); - // This is a decoding token + if (!profiling_requests[guid].first_token_time_set) { + profiling_requests[guid].first_token_time = + Realm::Clock::current_time_in_microseconds(); + profiling_requests[guid].first_token_time_set = true; + } log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); // std::string output = this->tokenizer_->Decode(request.tokens); @@ -610,12 +625,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, profiling_requests[request.guid] = profile_info; log_req_mgr.print( "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", request.guid, profile_info.llm_decoding_steps, profile_info.start_time, profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); @@ -625,6 +641,9 @@ BatchConfig 
RequestManager::prepare_next_batch(BatchConfig const &old_bc, << profile_info.llm_decoding_steps << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) + << ") ttft(" << std::fixed << std::setprecision(3) + << (profile_info.first_token_time - + profile_info.registration_time) << ")\n"; outputFile.close(); } else { @@ -717,11 +736,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + // add start time to profile_info for the new request + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1233,13 +1251,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + profiling_requests[new_request.guid].llm_decoding_steps = 0; + profiling_requests[new_request.guid].ssm_decoding_steps = 0; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; + int ssm_decoding_steps = + profiling_requests[new_request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = spec_infer_tree_width.size() > ssm_decoding_steps From 2f92a650289fa7ae2d4e0f201df27f1a31767e47 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 Apr 2024 11:23:40 -0700 Subject: [PATCH 171/198] add warmup phase --- include/flexflow/request_manager.h | 3 ++- inference/peft/peft.cc | 32 ++++++++++++++++++++++++++---- src/runtime/request_manager.cc | 29 +++++++++++++++------------ 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index ddf798d456..e8e2e7eefc 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -83,6 +83,7 @@ struct Request { int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; + bool warmup = false; std::string dataset_filepath; std::vector, std::vector>> @@ -136,7 +137,7 @@ class RequestManager { int get_max_sequence_length(); void set_enable_peft_finetuning(bool enable_peft_finetuning_); void set_disable_peft_bwd(bool disable_peft_bwd_); - static void set_inference_finished(); + static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 5c96709be7..030bf8167d 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -314,7 +314,34 @@ void FlexFlow::top_level_task(Task 
const *task, // Start background server rm->start_background_server(&model); - int total_num_requests = 0; + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 256; + inference_req.max_sequence_length = 1024; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload { std::vector requests; @@ -349,7 +376,6 @@ void FlexFlow::top_level_task(Task const *task, inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); - total_num_requests++; } // Add fine-tuning request @@ -362,7 +388,6 @@ void FlexFlow::top_level_task(Task const *task, // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; fine_tuning_req.max_training_steps = 1000000000; requests.push_back(fine_tuning_req); - total_num_requests++; std::vector result = model.generate(requests); } @@ -380,7 +405,6 @@ void FlexFlow::top_level_task(Task const *task, free(peft_model_id); } - // float* data std::cout << "----------inference finished--------------" << std::endl; // free tokenizer space in memory diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1d1d98fce9..96b481edf0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -170,8 +170,8 @@ void RequestManager::set_disable_peft_bwd(bool disable_peft_bwd_) { disable_peft_bwd = disable_peft_bwd_; } -void RequestManager::set_inference_finished() { - inference_finished = true; +void RequestManager::set_inference_finished(bool finished) { + inference_finished = finished; } void RequestManager::register_tokenizer(ModelType type, @@ -250,6 +250,7 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; + request.warmup = request_.warmup; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } @@ -333,6 +334,7 @@ RequestManager::RequestGuid request.completed_training_steps = 0; request.max_training_steps = request_.max_training_steps; request.dataset_filepath = request_.dataset_filepath; + request.warmup = request_.warmup; // Load dataset if (request_.benchmarking_tokens >= 0) { @@ -623,21 +625,22 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time, - 
profile_info.first_token_time - profile_info.registration_time); + log_req_mgr.print("[%s] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", + request.warmup ? "Warmup" : "Profile", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - + profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "[Profile] guid(" << request.guid - << ") llm_decoding_steps(" + outputFile << "[" << (request.warmup ? "Warmup" : "Profile") + << "] guid(" << request.guid << ") llm_decoding_steps(" << profile_info.llm_decoding_steps << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) From b1e97b190067c983308472a46a5b3cf4ec86bb7c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 Apr 2024 22:50:03 -0700 Subject: [PATCH 172/198] add benchmarking code --- inference/peft/CMakeLists.txt | 95 ++++++- inference/peft/peft.cc | 4 +- inference/peft/peft_bwd_benchmark.cc | 403 +++++++++++++++++++++++++++ inference/peft/peft_fwd_benchmark.cc | 375 +++++++++++++++++++++++++ src/runtime/request_manager.cc | 4 +- 5 files changed, 864 insertions(+), 17 deletions(-) create mode 100644 inference/peft/peft_bwd_benchmark.cc create mode 100644 inference/peft/peft_fwd_benchmark.cc diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt index 4547907176..9595f691f6 100644 --- a/inference/peft/CMakeLists.txt +++ b/inference/peft/CMakeLists.txt @@ -1,10 +1,10 @@ cmake_minimum_required(VERSION 3.10) project(FlexFlow_Peft) -set(project_target peft) - -set(CPU_SRC +# Normal PEFT +set(project_target1 peft) +set(CPU_SRC1 ${FLEXFLOW_CPP_DRV_SRC} peft.cc ../models/llama.cc @@ -14,25 +14,92 @@ set(CPU_SRC ../models/mpt.cc) if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") - cuda_add_executable(${project_target} ${CPU_SRC}) + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + +# FWD benchmark +set(project_target2 peft_fwd_benchmark) +set(CPU_SRC2 + ${FLEXFLOW_CPP_DRV_SRC} + peft_fwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND 
STREQUAL "hip_cuda") + cuda_add_executable(${project_target2} ${CPU_SRC2}) if (FF_GPU_BACKEND STREQUAL "hip_cuda") - target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__) endif() elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") - set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) - hip_add_executable(${project_target} ${CPU_SRC}) + set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target2} ${CPU_SRC2}) if (FF_HIP_ARCH STREQUAL "") message(FATAL_ERROR "FF_HIP_ARCH is empty!") endif() - set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") - target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) + set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__) else() - message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") + message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported") endif() -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target2} DESTINATION ${BIN_DEST}) + +# BWD benchmark +set(project_target3 peft_bwd_benchmark) +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + peft_bwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 030bf8167d..ab2f9496bf 100644 --- a/inference/peft/peft.cc +++ 
b/inference/peft/peft.cc @@ -319,8 +319,8 @@ void FlexFlow::top_level_task(Task const *task, std::vector requests; for (int i = 0; i < 100; i++) { Request inference_req; - inference_req.benchmarking_tokens = 256; - inference_req.max_sequence_length = 1024; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc new file mode 100644 index 0000000000..a5f451350e --- /dev/null +++ b/inference/peft/peft_bwd_benchmark.cc @@ -0,0 +1,403 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if 
(!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run, + enable_peft_finetuning, + disable_peft_bwd); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 
100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector lengths; + int index = 0; + for (auto &entry : prompt_json) { + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; + } + printf("Total number of finetuning requests: %d", lengths.size()); + + // Add fine-tuning requests + for (int i = 0; i < lengths.size(); i++) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------finetuning finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc new file mode 100644 index 0000000000..215b2f80f4 --- /dev/null +++ b/inference/peft/peft_fwd_benchmark.cc @@ -0,0 +1,375 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = 
false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run, + enable_peft_finetuning, + disable_peft_bwd); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? 
LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %d", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 96b481edf0..eee13c4cc6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2670,7 +2670,9 @@ std::vector for (int i = 0; i < inf_guids.size(); i++) { results.push_back(rm->get_generation_result(inf_guids[i])); } - rm->set_inference_finished(); + if (inf_guids.size() > 0) { + rm->set_inference_finished(); + } for (int i = 0; i < peft_guids.size(); i++) { results.push_back(rm->get_generation_result(peft_guids[i])); } From e35ebb2ced300bd22b43220518a05db0d1eb78ca Mon Sep 17 00:00:00 2001 From: Remi <54138269+Flechman@users.noreply.github.com> Date: Wed, 17 Apr 2024 04:56:35 -0400 Subject: [PATCH 173/198] Add scripts for evaluation with Microsoft Azure trace (#1363) * Add scripts for evaluation * Add absolute request rate value * Fix script for target arrival rate * Fix cpp req rate benchmark * update to use new dataset * Fix infinite loop * update * add data --------- Co-authored-by: Remi Delacourt Co-authored-by: Gabriele Oliaro --- include/flexflow/request_manager.h | 1 + inference/peft/CMakeLists.txt | 34 ++ inference/peft/req_rate_benchmark.cc | 530 +++++++++++++++++++++++++++ rdelacou/generate_trace.py | 121 ++++++ src/runtime/request_manager.cc | 19 +- 5 files changed, 702 insertions(+), 3 deletions(-) create mode 100644 inference/peft/req_rate_benchmark.cc create mode 100644 rdelacou/generate_trace.py diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e8e2e7eefc..ba8a5833ee 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -83,6 +83,7 @@ struct Request { int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; + std::vectorfinetuning_tokens_per_batch; bool warmup = false; std::string dataset_filepath; std::vector, diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt index 9595f691f6..e0bad79cab 100644 --- a/inference/peft/CMakeLists.txt +++ b/inference/peft/CMakeLists.txt @@ -103,3 +103,37 @@ target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/infere target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) + +# Online peft +set(project_target4 req_rate_benchmark) +set(CPU_SRC4 + ${FLEXFLOW_CPP_DRV_SRC} + req_rate_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC4} PROPERTIES LANGUAGE HIP) 
+ hip_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target4} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target4} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target4} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target4} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target4} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target4} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc new file mode 100644 index 0000000000..bc40de68f8 --- /dev/null +++ b/inference/peft/req_rate_benchmark.cc @@ -0,0 +1,530 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "inference/models/falcon.h" +#include "inference/models/llama.h" +#include "inference/models/mpt.h" +#include "inference/models/opt.h" +#include "inference/models/starcoder.h" +#include +#include +#include +#include + +#include + + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +class ConcurrentQueue { +public: + std::queue inf_queue; + std::queue peft_queue; + std::mutex request_queue_mutex; + bool producer_finished = false; +}; + +ConcurrentQueue *common_guids_singleton = nullptr; +int nb_millisecs = 1000; // Default bucket timeframe is 1 second + +ConcurrentQueue *get_common_guids_queue() { + if (common_guids_singleton == nullptr) { + common_guids_singleton = new ConcurrentQueue(); + } + return common_guids_singleton; +} + +void consume() { + RequestManager *rm = RequestManager::get_request_manager(); + ConcurrentQueue *guids = get_common_guids_queue(); + bool producer_is_finished = false; + bool queue_is_empty = false; + // int i=0; + while(!producer_is_finished || !queue_is_empty) { + RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; + { + const std::lock_guard lock(guids->request_queue_mutex); + queue_is_empty = guids->inf_queue.empty(); + producer_is_finished = guids->producer_finished; + if (!queue_is_empty) { + guid = guids->inf_queue.front(); + guids->inf_queue.pop(); + } + } + if (guid != RequestManager::INVALID_GUID) { + GenerationResult result = rm->get_generation_result(guid); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(nb_millisecs)); + } + // i++; + // cout << "Iteration " << i; + } + rm->set_inference_finished(); + + while (guids->peft_queue.size() > 0) { + GenerationResult result = 
rm->get_generation_result(guids->peft_queue.front()); + guids->peft_queue.pop(); + } +} + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_buckets_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd, + int &bucket_timeframe) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-buckets-to-run")) { + max_buckets_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } + if (!strcmp(argv[i], "--bucket-timeframe")) { + bucket_timeframe = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_buckets_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + int bucket_timespan = 1; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_buckets_to_run, + enable_peft_finetuning, + disable_peft_bwd, + bucket_timespan); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Now run online workload! 
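+  // Replay overview: this thread is the producer. It sleeps until each
+  // bucket's arrival time (bucket index scaled by nb_millisecs), registers the
+  // bucket's inference requests with the RequestManager, and pushes the
+  // returned guids onto the shared ConcurrentQueue. The consumer thread
+  // (consume(), started below) pops guids and blocks on
+  // get_generation_result(); once producer_finished is set it drains the
+  // queue, calls set_inference_finished(), and then waits on the fine-tuning
+  // guids left in peft_queue.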
+ + nb_millisecs = nb_millisecs * bucket_timespan; + int total_num_requests = 0; + int num_arrival_buckets = 0; + ConcurrentQueue *guids = get_common_guids_queue(); + std::thread consumer{consume}; + { + + // Load all requests in advance + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + const auto& lists = prompt_json.get>>(); + std::vector bucket_arrival_times_s; + std::vector>> buckets; + + size_t index=0; + for (const auto& list : lists) { + if (!list.empty()) { + bucket_arrival_times_s.push_back(index); + std::vector> prompts; + for (const auto& dict : list) { + int prompt_length = dict["human"]; + int sequence_length = dict["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + } + buckets.push_back(prompts); + } + index++; + } + assert(bucket_arrival_times_s.size() == buckets.size() && + "Bucket arrival times and buckets are not the same size"); + // for (int i=0; i<10; i++) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, bucket_arrival_times_s[i]); + // printf("bucket[%i]: %i\n", i, buckets[i].size()); + // for (const auto& prompt : buckets[i]) { + // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); + // } + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1000000000; + RequestManager::RequestGuid ft_guid = rm->register_new_peft_request(fine_tuning_req); + if (ft_guid != RequestManager::INVALID_GUID) { + const std::lock_guard lock(guids->request_queue_mutex); + guids->peft_queue.push(ft_guid); + } + + // Replay the trace of inference requests + auto start_time = std::chrono::steady_clock::now(); + for (int i=0; i= max_buckets_to_run) { + break; + } + // sleep until bucket arrives + auto bucket_arrival_time = start_time + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + std::this_thread::sleep_until(bucket_arrival_time); + + // create inference requests for the bucket + std::vector requests; + for (const auto& prompt : buckets[i]) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + { + const std::lock_guard lock(guids->request_queue_mutex); + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + guids->inf_queue.push(guid); + } + } + } + } + + { // Notify the consumer that no more requests are incoming + const std::lock_guard lock(guids->request_queue_mutex); + guids->producer_finished = true; + } + } + + // Wait for consumer to finish + consumer.join(); + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/rdelacou/generate_trace.py b/rdelacou/generate_trace.py new file mode 100644 index 0000000000..986dab37df --- /dev/null +++ b/rdelacou/generate_trace.py @@ -0,0 +1,121 @@ +import pandas as pd +from math import ceil +from random import shuffle, uniform +import json, pickle, requests, os, argparse + +class TraceBuilder(object): + + # trace_type: either "conv" or "code" + def __init__(self, import_times=True, import_prompts=True): + self.req_times = None + self.imported_req_times = False + self.prompt_data = None + self.imported_prompt_data = False + if import_times: + self.import_trace_timestamps() + if import_prompts: + self.import_prompt_data() + + def import_trace_timestamps(self, trace_type="conv"): + if not self.imported_req_times: + # Import Microsoft LLM 1 hour trace + df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"]) + req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds + req_times = req_times - req_times.min() + self.req_times = req_times.tolist() + self.imported_req_times = True + + def import_prompt_data(self, shuffle_=True): + if not self.imported_prompt_data: + sharegpt_filename = "sharegpt_opt_text_completion_length.pkl" + sharegpt_filepath = f"./{sharegpt_filename}" + if os.path.exists(sharegpt_filepath): + os.remove("sharegpt_opt_text_completion_length.pkl") + sharegpt_url = f"https://github.com/sosp-ae-39/sosp-ae-astra/raw/main/datasets/{sharegpt_filename}" + response = requests.get(sharegpt_url) + with open(sharegpt_filename, "wb") as file: + file.write(response.content) + with open(sharegpt_filepath, 'rb') as f: + data2 = pickle.load(f) + os.remove("sharegpt_opt_text_completion_length.pkl") + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + + for pair in data2: + assert(len(pair) == 2) + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + num_pairs = len(prompt_lengths) + assert(num_pairs == len(generation_lengths)) + print("Number of 
conversation pairs: ", num_pairs) + + print(f"Prompt lengths: min={min(prompt_lengths)}, max={max(prompt_lengths)}, avg={sum(prompt_lengths)/len(prompt_lengths)}") + print(f"Generation lengths: min={min(generation_lengths)}, max={max(generation_lengths)}, avg={sum(generation_lengths)/len(generation_lengths)}") + total_lengths = [prompt_lengths[i] + generation_lengths[i] for i in range(len(prompt_lengths))] + print(f"Total lengths: min={min(total_lengths)}, max={max(total_lengths)}, avg={sum(total_lengths)/len(total_lengths)}") + + self.prompt_data = [{"human": prompt_lengths[i], "gpt": generation_lengths[i]} for i in range(num_pairs)] + + if shuffle_: + shuffle(self.prompt_data) + self.imported_prompt_data = True + + # Delta is in seconds + # Rate is in req per second + def generate_trace(self, target_arrival_rate=10, debug_verbose=False): + self.import_trace_timestamps() + self.import_prompt_data() + + microsec = 1000000 + avg_arrival_rate = len(self.req_times) / (self.req_times[-1]/float(microsec)) # Request per second. Computed that way to enforce working with numbers of reasonable orders of magnitude + if debug_verbose: + print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate) + scale_factor = float(target_arrival_rate) / avg_arrival_rate + if debug_verbose: + print("Scale factor to obtain target arrival rate: ", scale_factor) + + # Buckets are 1 second timeframes + nb_buckets = ceil(self.req_times[-1] / microsec) + buckets = [] + j = 0 + k = 0 + for i in range(nb_buckets): + bucket_size = 0 + while(j < len(self.req_times) and self.req_times[j] >= i*microsec and self.req_times[j] < (i+1)*microsec): + bucket_size += 1 + j += 1 + bucket_size = bucket_size*scale_factor + prob = bucket_size - int(bucket_size) + bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob) + + # If used all of the prompt data, loop back at the beggining and reuse some prompts + if k+bucket_size > len(self.prompt_data): + bucket = self.prompt_data[k:] + self.prompt_data[:(k+bucket_size)%len(self.prompt_data)] + else: + bucket = self.prompt_data[k:k+bucket_size] + k = (k+bucket_size) % len(self.prompt_data) + buckets.append(bucket) + + if debug_verbose: + print("Avg arrival rate obtained (req/s): ", sum([len(b) for b in buckets])/len(buckets)) + return buckets + +def generate_and_save_trace(arrival_rate, output_file): + builder = TraceBuilder() + trace = builder.generate_trace(target_arrival_rate=arrival_rate, debug_verbose=True) + with open(output_file, 'w+') as f: + json.dump(trace, f, indent=2) + +if __name__ == '__main__': + # Set up the argument parser + parser = argparse.ArgumentParser(description='Generate and save a trace.') + parser.add_argument('--arrival-rate', type=float, default=10.0, help='The target arrival rate for the trace.') + parser.add_argument('--output-file', type=str, default='sharegpt.json', help='The path to the output file to save the trace.') + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function with the user-provided arrival rate + generate_and_save_trace(args.arrival_rate, args.output_file) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index eee13c4cc6..b1ca4d985a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -771,6 +771,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; request.processed_finetuning_tokens += old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; 
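+      // record how many finetuning tokens this batch contributed; the
+      // per-batch counts are reported as tokens_per_batch(...) in the
+      // profiling output below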
+ request.finetuning_tokens_per_batch.push_back( + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch); int dataset_entry = request.completed_training_steps % request.dataset.size(); if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + @@ -798,8 +800,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Finetuning] guid(%zu) completed_training_steps(%d) " + log_req_mgr.print("[%s] guid(%zu) completed_training_steps(%d) " "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.warmup ? "Warmup" : "Finetuning", request.guid, request.completed_training_steps, request.processed_finetuning_tokens, @@ -807,14 +810,24 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "[Finetuning] guid(" << request.guid + std::string tokens_str = "["; + for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); i++) { + tokens_str += std::to_string(request.finetuning_tokens_per_batch[i]); + if (i != request.finetuning_tokens_per_batch.size() - 1) { + tokens_str += ", "; + } + } + tokens_str += "]"; + outputFile << "[" << (request.warmup ? "Warmup" : "Finetuning") + << "] guid(" << request.guid << ") completed_training_steps(" << request.completed_training_steps << ") processed_finetuning_tokens(" << request.processed_finetuning_tokens << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) - << ")\n"; + << ") tokens_per_batch(" + << tokens_str << ")\n"; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath From b33f10f4015db431b093adfbdae0dd35872242d3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 25 Apr 2024 04:04:59 +0000 Subject: [PATCH 174/198] fix --- include/flexflow/request_manager.h | 2 +- inference/peft/peft.cc | 115 +++++++++----------------- inference/peft/peft_bwd_benchmark.cc | 34 ++++---- inference/peft/req_rate_benchmark.cc | 46 ++++++----- python/flexflow/core/flexflow_cffi.py | 17 ++-- src/runtime/request_manager.cc | 11 +-- tests/peft_test.sh | 13 ++- 7 files changed, 109 insertions(+), 129 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index ba8a5833ee..729f1b480c 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -83,7 +83,7 @@ struct Request { int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; - std::vectorfinetuning_tokens_per_batch; + std::vector finetuning_tokens_per_batch; bool warmup = false; std::string dataset_filepath; std::vector, diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index ab2f9496bf..e3503d98ee 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -33,6 +33,7 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string dataset_file_path; std::string output_file_path; }; @@ -50,7 +51,6 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run, bool &enable_peft_finetuning, bool &disable_peft_bwd) { for (int i = 1; i < argc; i++) { @@ -83,6 +83,11 @@ void 
parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // dataset for finetuning + if (!strcmp(argv[i], "-finetuning-dataset")) { + paths.dataset_file_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -121,10 +126,6 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "--max-requests-to-run")) { - max_requests_to_run = std::stoi(argv[++i]); - continue; - } if (!strcmp(argv[i], "-enable-peft-finetuning")) { enable_peft_finetuning = true; continue; @@ -160,11 +161,10 @@ void FlexFlow::top_level_task(Task const *task, bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 8; + int max_requests_per_batch = 1; int max_tokens_per_batch = 128; int max_sequence_length = 256; - int max_requests_to_run = 1000000000; - bool enable_peft_finetuning = false; + bool enable_peft_finetuning = true; bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); @@ -184,7 +184,6 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run, enable_peft_finetuning, disable_peft_bwd); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * @@ -314,81 +313,47 @@ void FlexFlow::top_level_task(Task const *task, // Start background server rm->start_background_server(&model); - // Warmup stage - { - std::vector requests; - for (int i = 0; i < 100; i++) { - Request inference_req; - inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; - inference_req.warmup = true; - inference_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - requests.push_back(inference_req); - } - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; - fine_tuning_req.warmup = true; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.max_training_steps = 1; - requests.push_back(fine_tuning_req); - std::vector result = model.generate(requests); - } - - rm->set_inference_finished(false); // reset inference finished flag - std::cout << "----------warmup finished--------------" << std::endl; - // Run workload { std::vector requests; // Add inference requests - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - std::vector> prompts; - int index = 0; - for (auto &entry : prompt_json) { - if (index >= max_requests_to_run) { - break; + if (!file_paths.prompt_file_path.empty()) { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + int total_num_requests = 0; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; } - int prompt_length = entry["human"]; - int sequence_length = entry["gpt"]; - assert(prompt_length + sequence_length <= max_sequence_length && - "Prompt + sequence length exceeds max sequence length"); - prompts.push_back(std::make_pair(prompt_length, sequence_length)); - index++; - } - printf("Total number of prompts: %d", prompts.size()); - for (auto &prompt : prompts) { - // printf("Prompt length: %d, sequence length: %d\n", prompt_length, - // sequence_length); - Request inference_req; - inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; - inference_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - requests.push_back(inference_req); } // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; - fine_tuning_req.max_training_steps = 1000000000; - requests.push_back(fine_tuning_req); - + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.dataset_file_path; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } std::vector result = model.generate(requests); } diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index a5f451350e..72ebe87227 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -356,28 +356,28 @@ void FlexFlow::top_level_task(Task const *task, std::vector lengths; int index = 0; for (auto &entry : prompt_json) { - if (index == max_requests_to_run) { - break; - } - int prompt_length = entry.get(); - assert(prompt_length > 0 && "Prompt length must be greater than 0."); - assert(prompt_length <= 1024 && - "Prompt length must be less than or equal to 1024."); - lengths.push_back(prompt_length); - index++; + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; } printf("Total number of finetuning requests: %d", lengths.size()); // Add fine-tuning requests for (int i = 0; i < lengths.size(); i++) { - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.benchmarking_tokens = lengths[i]; - fine_tuning_req.max_sequence_length = lengths[i]; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.max_training_steps = 1; - requests.push_back(fine_tuning_req); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); } std::vector result = model.generate(requests); } diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index bc40de68f8..08b087faed 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -21,13 +21,12 @@ #include "inference/models/opt.h" #include "inference/models/starcoder.h" #include -#include #include +#include #include #include - using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; @@ -58,7 +57,7 @@ void consume() { bool producer_is_finished = false; bool queue_is_empty = false; // int i=0; - while(!producer_is_finished || !queue_is_empty) { + while (!producer_is_finished || !queue_is_empty) { RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; { const std::lock_guard lock(guids->request_queue_mutex); @@ -78,9 +77,10 @@ void consume() { // cout << "Iteration " << i; } rm->set_inference_finished(); - + while (guids->peft_queue.size() > 0) { - GenerationResult result = rm->get_generation_result(guids->peft_queue.front()); + GenerationResult result = + rm->get_generation_result(guids->peft_queue.front()); guids->peft_queue.pop(); } } @@ -422,21 +422,21 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - - const auto& lists = prompt_json.get>>(); + + auto const &lists = prompt_json.get>>(); std::vector bucket_arrival_times_s; std::vector>> buckets; - size_t index=0; - for (const auto& list : lists) { + size_t index = 0; + for (auto const &list : lists) { if (!list.empty()) { bucket_arrival_times_s.push_back(index); std::vector> prompts; - for (const auto& dict : list) { + for (auto const &dict : list) { int prompt_length = dict["human"]; int sequence_length = dict["gpt"]; assert(prompt_length + sequence_length <= max_sequence_length && - "Prompt + sequence length exceeds max sequence length"); + "Prompt + sequence length exceeds max sequence length"); prompts.push_back(std::make_pair(prompt_length, sequence_length)); } buckets.push_back(prompts); @@ -446,9 +446,9 @@ void FlexFlow::top_level_task(Task const *task, assert(bucket_arrival_times_s.size() == buckets.size() && "Bucket arrival times and buckets are not the same size"); // for (int i=0; i<10; i++) { - // printf("bucket_arrival_times_s[%i]: %i\n", i, bucket_arrival_times_s[i]); - // printf("bucket[%i]: %i\n", i, buckets[i].size()); - // for (const auto& prompt : buckets[i]) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, + // bucket_arrival_times_s[i]); printf("bucket[%i]: %i\n", i, + // buckets[i].size()); for (const auto& prompt : buckets[i]) { // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); // } // } @@ -461,7 +461,8 @@ void FlexFlow::top_level_task(Task const *task, fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1000000000; - RequestManager::RequestGuid ft_guid = rm->register_new_peft_request(fine_tuning_req); + RequestManager::RequestGuid ft_guid = + rm->register_new_peft_request(fine_tuning_req); if (ft_guid != RequestManager::INVALID_GUID) { const std::lock_guard lock(guids->request_queue_mutex); guids->peft_queue.push(ft_guid); @@ -469,17 +470,19 @@ void FlexFlow::top_level_task(Task const *task, // Replay the trace of inference requests auto start_time = std::chrono::steady_clock::now(); - for (int i=0; i= max_buckets_to_run) { break; } // sleep until bucket arrives - auto bucket_arrival_time = start_time + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + auto bucket_arrival_time = + start_time + + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); std::this_thread::sleep_until(bucket_arrival_time); // create inference requests for the bucket std::vector requests; - for (const auto& prompt : buckets[i]) { + for (auto const &prompt : buckets[i]) { // printf("Prompt length: %d, sequence length: %d\n", prompt_length, // sequence_length); Request inference_req; @@ -493,14 +496,15 @@ void FlexFlow::top_level_task(Task const *task, { const std::lock_guard lock(guids->request_queue_mutex); for (int i = 0; i < requests.size(); i++) { - RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + RequestManager::RequestGuid guid = + rm->register_new_request(requests.at(i)); if (guid != RequestManager::INVALID_GUID) { guids->inf_queue.push(guid); } } } } - + { // Notify the consumer that no more requests are incoming const std::lock_guard lock(guids->request_queue_mutex); guids->producer_finished = true; @@ -519,8 +523,6 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } - - // float* data std::cout << "----------inference finished--------------" << std::endl; diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 82c3eb059c..b08fdba072 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1599,18 +1599,19 @@ def register_ssm_model(self, model): def set_max_requests_per_batch(self, max_requests): return ffc().flexflow_request_manager_set_max_requests_per_batch( - self.handle, max_requests - ) - + self.handle, max_requests) + def set_max_tokens_per_batch(self, max_tokens): return ffc().flexflow_request_manager_set_max_tokens_per_batch( - self.handle, max_tokens - ) - + self.handle, max_tokens) + + def set_max_spec_tree_token_num(self, max_tokens): + return ffc().flexflow_request_manager_set_max_spec_tree_token_num( + self.handle, max_tokens) + def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( - self.handle, max_length - ) + self.handle, max_length) def start_server(self, model): return ffc().flexflow_request_manager_start_background_server( diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b1ca4d985a..6a4d9658e0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -811,14 +811,16 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { std::string tokens_str = "["; - for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); i++) { - tokens_str += std::to_string(request.finetuning_tokens_per_batch[i]); + for (size_t i = 0; i < 
request.finetuning_tokens_per_batch.size(); + i++) { + tokens_str += + std::to_string(request.finetuning_tokens_per_batch[i]); if (i != request.finetuning_tokens_per_batch.size() - 1) { tokens_str += ", "; } } tokens_str += "]"; - outputFile << "[" << (request.warmup ? "Warmup" : "Finetuning") + outputFile << "[" << (request.warmup ? "Warmup" : "Finetuning") << "] guid(" << request.guid << ") completed_training_steps(" << request.completed_training_steps @@ -826,8 +828,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, << request.processed_finetuning_tokens << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) - << ") tokens_per_batch(" - << tokens_str << ")\n"; + << ") tokens_per_batch(" << tokens_str << ")\n"; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 9b4a5204ac..b32b69cd82 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -14,6 +14,8 @@ fi # Create test prompt file mkdir -p ../inference/prompt echo '["Two things are infinite: "]' > ../inference/prompt/peft.json +echo "[\"“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.“\"]" > ../inference/prompt/peft_dataset.json + # Create output folder mkdir -p ../inference/output @@ -26,7 +28,16 @@ python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full -- # if first time, add: --refresh-cache # CPP test -../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft +../build/inference/peft/peft \ + -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora-full \ + --use-full-precision \ + --inference-debugging \ + --fusion \ + -enable-peft # Python test python ../inference/python/ff_peft.py From 97562d6258c87d1bab39b9363b8348a3468d55c4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 1 May 2024 22:47:55 +0000 Subject: [PATCH 175/198] fix --- include/flexflow/flexflow_c.h | 3 +++ include/flexflow/request_manager.h | 2 -- inference/peft/peft.cc | 18 ++---------------- inference/peft/peft_bwd_benchmark.cc | 20 +++----------------- inference/peft/peft_fwd_benchmark.cc | 20 +++----------------- inference/peft/req_rate_benchmark.cc | 14 -------------- inference/python/ff_peft.py | 1 + python/flexflow/core/flexflow_cffi.py | 4 ++++ python/flexflow/serve/serve.py | 14 ++++++++------ src/c/flexflow_c.cc | 8 ++++++++ src/runtime/request_manager.cc | 6 +----- tests/peft_test.sh | 4 ++-- 12 files changed, 35 insertions(+), 79 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index d6cdb910c4..b651b31052 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -989,6 +989,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType 
model_type, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 729f1b480c..fe0e4b2f9d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -137,7 +137,6 @@ class RequestManager { void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); void set_enable_peft_finetuning(bool enable_peft_finetuning_); - void set_disable_peft_bwd(bool disable_peft_bwd_); static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, @@ -287,7 +286,6 @@ class RequestManager { // peft benchmarking bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; static bool inference_finished; // tree width in each speculative step, if not specified 1 diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index e3503d98ee..f800b7f17c 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -50,9 +50,7 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length, - bool &enable_peft_finetuning, - bool &disable_peft_bwd) { + int &max_sequence_length) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -126,14 +124,6 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], "-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -165,7 +155,6 @@ void FlexFlow::top_level_task(Task const *task, int max_tokens_per_batch = 128; int max_sequence_length = 256; bool enable_peft_finetuning = true; - bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -183,9 +172,7 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length, - enable_peft_finetuning, - disable_peft_bwd); + max_sequence_length); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -264,7 +251,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 72ebe87227..c0d7d33ae4 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -50,9 +50,7 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run, - bool &enable_peft_finetuning, - bool &disable_peft_bwd) { + int &max_requests_to_run) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -125,14 +123,6 @@ void parse_input_args(char **argv, max_requests_to_run = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], 
"-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -165,7 +155,6 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 256; int max_requests_to_run = 1000000000; bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -184,9 +173,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run, - enable_peft_finetuning, - disable_peft_bwd); + max_requests_to_run); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -265,7 +252,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { @@ -366,7 +352,7 @@ void FlexFlow::top_level_task(Task const *task, lengths.push_back(prompt_length); index++; } - printf("Total number of finetuning requests: %d", lengths.size()); + printf("Total number of finetuning requests: %ld", lengths.size()); // Add fine-tuning requests for (int i = 0; i < lengths.size(); i++) { diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 215b2f80f4..7be90e083a 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -50,9 +50,7 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run, - bool &enable_peft_finetuning, - bool &disable_peft_bwd) { + int &max_requests_to_run) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -125,14 +123,6 @@ void parse_input_args(char **argv, max_requests_to_run = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], "-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -165,7 +155,6 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 256; int max_requests_to_run = 1000000000; bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -184,9 +173,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run, - enable_peft_finetuning, - disable_peft_bwd); + max_requests_to_run); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -265,7 +252,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type 
== ModelType::LLAMA) { @@ -339,7 +325,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(std::make_pair(prompt_length, sequence_length)); index++; } - printf("Total number of prompts: %d", prompts.size()); + printf("Total number of prompts: %ld", prompts.size()); for (auto &prompt : prompts) { // printf("Prompt length: %d, sequence length: %d\n", prompt_length, // sequence_length); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index 08b087faed..3824b93840 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -106,8 +106,6 @@ void parse_input_args(char **argv, int &max_tokens_per_batch, int &max_sequence_length, int &max_buckets_to_run, - bool &enable_peft_finetuning, - bool &disable_peft_bwd, int &bucket_timeframe) { for (int i = 1; i < argc; i++) { // llm model type @@ -181,14 +179,6 @@ void parse_input_args(char **argv, max_buckets_to_run = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], "-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } if (!strcmp(argv[i], "--bucket-timeframe")) { bucket_timeframe = std::stoi(argv[++i]); continue; @@ -227,7 +217,6 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 256; int max_buckets_to_run = 1000000000; bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; int bucket_timespan = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); @@ -248,8 +237,6 @@ void FlexFlow::top_level_task(Task const *task, max_tokens_per_batch, max_sequence_length, max_buckets_to_run, - enable_peft_finetuning, - disable_peft_bwd, bucket_timespan); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -329,7 +316,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 38a25fb614..caf7ce1774 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -109,6 +109,7 @@ def main(): ) llm.compile( generation_config, + enable_peft_finetuning = (len(configs.finetuning_dataset) > 0), max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64, diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b08fdba072..ec4cacfa6d 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1612,6 +1612,10 @@ def set_max_spec_tree_token_num(self, max_tokens): def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) + + def set_enable_peft_finetuning(self, enable_peft_finetuning): + return ffc().flexflow_request_manager_set_enable_peft_finetuning( + self.handle, enable_peft_finetuning) def start_server(self, model): return ffc().flexflow_request_manager_start_background_server( diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 1956946380..248fe55d93 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -349,6 +349,7 @@ def 
compile( max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = None, model_specific_tensor_parallelism_degree: int = None, model_specific_pipeline_parallelism_degree: int = None, @@ -364,6 +365,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None @@ -373,9 +376,6 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - # self.max_requests_per_batch = max_requests_per_batch - # self.max_seq_length = max_seq_length - # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -407,6 +407,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate the relevant model self.model = self.model_class( @@ -560,15 +561,13 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 128, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory - - :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 @@ -577,6 +576,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults 
to 1 @@ -591,6 +592,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + enable_peft_finetuning, model_specific_data_parallelism_degree, model_specific_tensor_parallelism_degree, model_specific_pipeline_parallelism_degree, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 3a6c18aa7b..993d1b6a0d 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2662,6 +2662,14 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_enable_peft_finetuning(enable_peft_finetuning_); + DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d", + enable_peft_finetuning_); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 6a4d9658e0..e3c6e7c6f3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -166,10 +166,6 @@ void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { enable_peft_finetuning = enable_peft_finetuning_; } -void RequestManager::set_disable_peft_bwd(bool disable_peft_bwd_) { - disable_peft_bwd = disable_peft_bwd_; -} - void RequestManager::set_inference_finished(bool finished) { inference_finished = finished; } @@ -2846,7 +2842,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); - if (llm->config.enable_peft && !disable_peft_bwd) { + if (llm->config.enable_peft) { im->peft_bwd(llm, 0, bcf); } assert(fm.get_future_map_domain().get_volume() == 1); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index b32b69cd82..a5892fd59d 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -29,13 +29,13 @@ python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full -- # CPP test ../build/inference/peft/peft \ - -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ../inference/prompt/peft_dataset.json \ -peft-model goliaro/llama-160m-lora-full \ --use-full-precision \ - --inference-debugging \ --fusion \ -enable-peft From 985c2548aef8f2257bf486307dd66909ba83d7de Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 1 May 2024 22:49:30 +0000 Subject: [PATCH 176/198] add peft tests to ci --- .github/workflows/gpu-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7bdb6805a8..b5260ead05 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -174,6 +174,9 @@ jobs: # Inference tests source ./build/set_python_envs.sh ./tests/inference_tests.sh + + # PEFT tests + ./tests/peft_tests.sh - name: Save inference output as an artifact if: always() From f033b4e1860dad8d904322be7aea8cac308d5a50 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 May 2024 19:07:30 +0000 Subject: [PATCH 177/198] shellcheck --- tests/peft_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index a5892fd59d..6e6147bbb0 100755 --- 
a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -14,7 +14,7 @@ fi # Create test prompt file mkdir -p ../inference/prompt echo '["Two things are infinite: "]' > ../inference/prompt/peft.json -echo "[\"“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.“\"]" > ../inference/prompt/peft_dataset.json +echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.“"]' > ../inference/prompt/peft_dataset.json # Create output folder From 10119279e408d46c92c071bfee099d8035a7ea03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 9 May 2024 22:19:57 +0000 Subject: [PATCH 178/198] fix --- src/runtime/file_loader.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index fd31f21b26..c373e0da9b 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -769,6 +769,10 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (weight_filename != "embed_tokens_weight_lm_head") { weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } } From 9064c2ba40cde8b98fd765a6d5cf58df2754cd90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 9 May 2024 22:22:46 +0000 Subject: [PATCH 179/198] fix python requirements --- conda/flexflow.yml | 1 + docker/flexflow-environment/Dockerfile | 2 +- requirements.txt | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 89421db758..091ba929e4 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -30,4 +30,5 @@ dependencies: - datasets - accelerate - loralib + - triton - peft diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 84ee157302..fb4ea0ef75 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -94,7 +94,7 @@ RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related -RUN pip3 install scipy bitsandbytes datasets accelerate loralib peft +RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/requirements.txt b/requirements.txt index 43df6a2975..f408ce7e06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ bitsandbytes datasets accelerate loralib +triton peft From a125e86090ea09e3a10d607a6e485191eb5751eb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 May 2024 00:19:58 +0000 Subject: [PATCH 180/198] fix --- src/ops/lora_linear.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 170e087226..95c60d2531 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -106,6 +106,10 @@ PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { 1 /*outputs*/, input, output); + // fix LoRA layer's transformer layer ID and model ID + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; { int numdims = output->num_dims; int dims[MAX_TENSOR_DIM]; From d74fe53ef66848003367ee5c7518875f96a77f80 
Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 11 May 2024 00:09:24 +0000 Subject: [PATCH 181/198] fix --- inference/python/ff_peft.py | 2 +- .../alignment/llama_alignment_tests.ipynb | 560 +++++++++++++++--- tests/peft/hf_finetune.py | 2 +- tests/peft_test.sh | 30 +- 4 files changed, 500 insertions(+), 94 deletions(-) diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index caf7ce1774..657748c6a9 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -65,7 +65,7 @@ def get_configs(): # required parameters "base_model": "JackFram/llama-160m", "peft_model_ids": [ - "goliaro/llama-160m-lora-full", + "goliaro/llama-160m-lora", ], # optional parameters "cache_path": "", diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb index 414280cff5..868dad18e3 100644 --- a/tests/peft/alignment/llama_alignment_tests.ipynb +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -15,6 +15,30 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/tests/peft/hf_peft_tensors /usr/FlexFlow/build/inference_tensors\n" + ] + } + ], + "source": [ + "print(hf_path, ff_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check weights (semi-automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -81,7 +105,427 @@ "Ok!\n", "Ok!\n", "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "files_list = os.listdir(hf_path)\n", + "num_layers=12\n", + "for f in sorted(files_list):\n", + " if f.endswith(\".weight\"):\n", + " if \"self_attn\" in f:\n", + " continue\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " # print(f, f_version)\n", + " hf_w_path = os.path.join(hf_path, f)\n", + " ff_w_path = os.path.join(ff_path, f_version)\n", + " assert(os.path.isfile(hf_w_path))\n", + " assert(os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", ff_w_path)\n", + "\n", + " # check equivalence\n", + " compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model for automatic check" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import PeftModel, PeftConfig\n", + "use_full_precision=True\n", + "peft_model_id=\"goliaro/llama-160m-lora\"\n", + "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", + "if peft_config.peft_type != \"LORA\":\n", + " raise ValueError(f\"PEFT type {peft_config.peft_type} not supported yet\")\n", + "\n", + "peft_config.init_lora_weights = (\n", + " False\n", + ") # prevent HF from re-inizialing the weights randomly\n", + "model_name = peft_config.base_model_name_or_path\n", + "# Load base model, and apply the PEFT layer\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " torch_dtype=torch.float32 if use_full_precision else torch.float16,\n", + " device_map=\"auto\",\n", + ")\n", + "model = PeftModel.from_pretrained(model, peft_model_id, config=peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens\n", + "layers\n", + "layers.0\n", + "layers.0.self_attn\n", + "layers.0.self_attn.q_proj\n", + "layers.0.self_attn.k_proj\n", + "layers.0.self_attn.v_proj\n", + "layers.0.self_attn.o_proj\n", + "layers.0.self_attn.rotary_emb\n", + "layers.0.mlp\n", + "layers.0.mlp.gate_proj\n", + "layers.0.mlp.up_proj\n", + "layers.0.mlp.down_proj\n", + "layers.0.mlp.down_proj.base_layer\n", + "layers.0.mlp.down_proj.lora_dropout\n", + "layers.0.mlp.down_proj.lora_dropout.default\n", + "layers.0.mlp.down_proj.lora_A\n", + "layers.0.mlp.down_proj.lora_A.default\n", + "layers.0.mlp.down_proj.lora_B\n", + "layers.0.mlp.down_proj.lora_B.default\n", + "layers.0.mlp.down_proj.lora_embedding_A\n", + "layers.0.mlp.down_proj.lora_embedding_B\n", + "layers.0.mlp.act_fn\n", + "layers.0.input_layernorm\n", + "layers.0.post_attention_layernorm\n", + "layers.1\n", + "layers.1.self_attn\n", + "layers.1.self_attn.q_proj\n", + "layers.1.self_attn.k_proj\n", + "layers.1.self_attn.v_proj\n", + "layers.1.self_attn.o_proj\n", + "layers.1.self_attn.rotary_emb\n", + "layers.1.mlp\n", + "layers.1.mlp.gate_proj\n", + "layers.1.mlp.up_proj\n", + "layers.1.mlp.down_proj\n", + "layers.1.mlp.down_proj.base_layer\n", + "layers.1.mlp.down_proj.lora_dropout\n", + "layers.1.mlp.down_proj.lora_dropout.default\n", + "layers.1.mlp.down_proj.lora_A\n", + "layers.1.mlp.down_proj.lora_A.default\n", + "layers.1.mlp.down_proj.lora_B\n", + "layers.1.mlp.down_proj.lora_B.default\n", + "layers.1.mlp.down_proj.lora_embedding_A\n", + "layers.1.mlp.down_proj.lora_embedding_B\n", + "layers.1.mlp.act_fn\n", + "layers.1.input_layernorm\n", + "layers.1.post_attention_layernorm\n", + "layers.2\n", + "layers.2.self_attn\n", + "layers.2.self_attn.q_proj\n", + "layers.2.self_attn.k_proj\n", + "layers.2.self_attn.v_proj\n", + "layers.2.self_attn.o_proj\n", + "layers.2.self_attn.rotary_emb\n", + "layers.2.mlp\n", + "layers.2.mlp.gate_proj\n", + "layers.2.mlp.up_proj\n", + "layers.2.mlp.down_proj\n", + "layers.2.mlp.down_proj.base_layer\n", + "layers.2.mlp.down_proj.lora_dropout\n", + 
"layers.2.mlp.down_proj.lora_dropout.default\n", + "layers.2.mlp.down_proj.lora_A\n", + "layers.2.mlp.down_proj.lora_A.default\n", + "layers.2.mlp.down_proj.lora_B\n", + "layers.2.mlp.down_proj.lora_B.default\n", + "layers.2.mlp.down_proj.lora_embedding_A\n", + "layers.2.mlp.down_proj.lora_embedding_B\n", + "layers.2.mlp.act_fn\n", + "layers.2.input_layernorm\n", + "layers.2.post_attention_layernorm\n", + "layers.3\n", + "layers.3.self_attn\n", + "layers.3.self_attn.q_proj\n", + "layers.3.self_attn.k_proj\n", + "layers.3.self_attn.v_proj\n", + "layers.3.self_attn.o_proj\n", + "layers.3.self_attn.rotary_emb\n", + "layers.3.mlp\n", + "layers.3.mlp.gate_proj\n", + "layers.3.mlp.up_proj\n", + "layers.3.mlp.down_proj\n", + "layers.3.mlp.down_proj.base_layer\n", + "layers.3.mlp.down_proj.lora_dropout\n", + "layers.3.mlp.down_proj.lora_dropout.default\n", + "layers.3.mlp.down_proj.lora_A\n", + "layers.3.mlp.down_proj.lora_A.default\n", + "layers.3.mlp.down_proj.lora_B\n", + "layers.3.mlp.down_proj.lora_B.default\n", + "layers.3.mlp.down_proj.lora_embedding_A\n", + "layers.3.mlp.down_proj.lora_embedding_B\n", + "layers.3.mlp.act_fn\n", + "layers.3.input_layernorm\n", + "layers.3.post_attention_layernorm\n", + "layers.4\n", + "layers.4.self_attn\n", + "layers.4.self_attn.q_proj\n", + "layers.4.self_attn.k_proj\n", + "layers.4.self_attn.v_proj\n", + "layers.4.self_attn.o_proj\n", + "layers.4.self_attn.rotary_emb\n", + "layers.4.mlp\n", + "layers.4.mlp.gate_proj\n", + "layers.4.mlp.up_proj\n", + "layers.4.mlp.down_proj\n", + "layers.4.mlp.down_proj.base_layer\n", + "layers.4.mlp.down_proj.lora_dropout\n", + "layers.4.mlp.down_proj.lora_dropout.default\n", + "layers.4.mlp.down_proj.lora_A\n", + "layers.4.mlp.down_proj.lora_A.default\n", + "layers.4.mlp.down_proj.lora_B\n", + "layers.4.mlp.down_proj.lora_B.default\n", + "layers.4.mlp.down_proj.lora_embedding_A\n", + "layers.4.mlp.down_proj.lora_embedding_B\n", + "layers.4.mlp.act_fn\n", + "layers.4.input_layernorm\n", + "layers.4.post_attention_layernorm\n", + "layers.5\n", + "layers.5.self_attn\n", + "layers.5.self_attn.q_proj\n", + "layers.5.self_attn.k_proj\n", + "layers.5.self_attn.v_proj\n", + "layers.5.self_attn.o_proj\n", + "layers.5.self_attn.rotary_emb\n", + "layers.5.mlp\n", + "layers.5.mlp.gate_proj\n", + "layers.5.mlp.up_proj\n", + "layers.5.mlp.down_proj\n", + "layers.5.mlp.down_proj.base_layer\n", + "layers.5.mlp.down_proj.lora_dropout\n", + "layers.5.mlp.down_proj.lora_dropout.default\n", + "layers.5.mlp.down_proj.lora_A\n", + "layers.5.mlp.down_proj.lora_A.default\n", + "layers.5.mlp.down_proj.lora_B\n", + "layers.5.mlp.down_proj.lora_B.default\n", + "layers.5.mlp.down_proj.lora_embedding_A\n", + "layers.5.mlp.down_proj.lora_embedding_B\n", + "layers.5.mlp.act_fn\n", + "layers.5.input_layernorm\n", + "layers.5.post_attention_layernorm\n", + "layers.6\n", + "layers.6.self_attn\n", + "layers.6.self_attn.q_proj\n", + "layers.6.self_attn.k_proj\n", + "layers.6.self_attn.v_proj\n", + "layers.6.self_attn.o_proj\n", + "layers.6.self_attn.rotary_emb\n", + "layers.6.mlp\n", + "layers.6.mlp.gate_proj\n", + "layers.6.mlp.up_proj\n", + "layers.6.mlp.down_proj\n", + "layers.6.mlp.down_proj.base_layer\n", + "layers.6.mlp.down_proj.lora_dropout\n", + "layers.6.mlp.down_proj.lora_dropout.default\n", + "layers.6.mlp.down_proj.lora_A\n", + "layers.6.mlp.down_proj.lora_A.default\n", + "layers.6.mlp.down_proj.lora_B\n", + "layers.6.mlp.down_proj.lora_B.default\n", + "layers.6.mlp.down_proj.lora_embedding_A\n", + "layers.6.mlp.down_proj.lora_embedding_B\n", 
+ "layers.6.mlp.act_fn\n", + "layers.6.input_layernorm\n", + "layers.6.post_attention_layernorm\n", + "layers.7\n", + "layers.7.self_attn\n", + "layers.7.self_attn.q_proj\n", + "layers.7.self_attn.k_proj\n", + "layers.7.self_attn.v_proj\n", + "layers.7.self_attn.o_proj\n", + "layers.7.self_attn.rotary_emb\n", + "layers.7.mlp\n", + "layers.7.mlp.gate_proj\n", + "layers.7.mlp.up_proj\n", + "layers.7.mlp.down_proj\n", + "layers.7.mlp.down_proj.base_layer\n", + "layers.7.mlp.down_proj.lora_dropout\n", + "layers.7.mlp.down_proj.lora_dropout.default\n", + "layers.7.mlp.down_proj.lora_A\n", + "layers.7.mlp.down_proj.lora_A.default\n", + "layers.7.mlp.down_proj.lora_B\n", + "layers.7.mlp.down_proj.lora_B.default\n", + "layers.7.mlp.down_proj.lora_embedding_A\n", + "layers.7.mlp.down_proj.lora_embedding_B\n", + "layers.7.mlp.act_fn\n", + "layers.7.input_layernorm\n", + "layers.7.post_attention_layernorm\n", + "layers.8\n", + "layers.8.self_attn\n", + "layers.8.self_attn.q_proj\n", + "layers.8.self_attn.k_proj\n", + "layers.8.self_attn.v_proj\n", + "layers.8.self_attn.o_proj\n", + "layers.8.self_attn.rotary_emb\n", + "layers.8.mlp\n", + "layers.8.mlp.gate_proj\n", + "layers.8.mlp.up_proj\n", + "layers.8.mlp.down_proj\n", + "layers.8.mlp.down_proj.base_layer\n", + "layers.8.mlp.down_proj.lora_dropout\n", + "layers.8.mlp.down_proj.lora_dropout.default\n", + "layers.8.mlp.down_proj.lora_A\n", + "layers.8.mlp.down_proj.lora_A.default\n", + "layers.8.mlp.down_proj.lora_B\n", + "layers.8.mlp.down_proj.lora_B.default\n", + "layers.8.mlp.down_proj.lora_embedding_A\n", + "layers.8.mlp.down_proj.lora_embedding_B\n", + "layers.8.mlp.act_fn\n", + "layers.8.input_layernorm\n", + "layers.8.post_attention_layernorm\n", + "layers.9\n", + "layers.9.self_attn\n", + "layers.9.self_attn.q_proj\n", + "layers.9.self_attn.k_proj\n", + "layers.9.self_attn.v_proj\n", + "layers.9.self_attn.o_proj\n", + "layers.9.self_attn.rotary_emb\n", + "layers.9.mlp\n", + "layers.9.mlp.gate_proj\n", + "layers.9.mlp.up_proj\n", + "layers.9.mlp.down_proj\n", + "layers.9.mlp.down_proj.base_layer\n", + "layers.9.mlp.down_proj.lora_dropout\n", + "layers.9.mlp.down_proj.lora_dropout.default\n", + "layers.9.mlp.down_proj.lora_A\n", + "layers.9.mlp.down_proj.lora_A.default\n", + "layers.9.mlp.down_proj.lora_B\n", + "layers.9.mlp.down_proj.lora_B.default\n", + "layers.9.mlp.down_proj.lora_embedding_A\n", + "layers.9.mlp.down_proj.lora_embedding_B\n", + "layers.9.mlp.act_fn\n", + "layers.9.input_layernorm\n", + "layers.9.post_attention_layernorm\n", + "layers.10\n", + "layers.10.self_attn\n", + "layers.10.self_attn.q_proj\n", + "layers.10.self_attn.k_proj\n", + "layers.10.self_attn.v_proj\n", + "layers.10.self_attn.o_proj\n", + "layers.10.self_attn.rotary_emb\n", + "layers.10.mlp\n", + "layers.10.mlp.gate_proj\n", + "layers.10.mlp.up_proj\n", + "layers.10.mlp.down_proj\n", + "layers.10.mlp.down_proj.base_layer\n", + "layers.10.mlp.down_proj.lora_dropout\n", + "layers.10.mlp.down_proj.lora_dropout.default\n", + "layers.10.mlp.down_proj.lora_A\n", + "layers.10.mlp.down_proj.lora_A.default\n", + "layers.10.mlp.down_proj.lora_B\n", + "layers.10.mlp.down_proj.lora_B.default\n", + "layers.10.mlp.down_proj.lora_embedding_A\n", + "layers.10.mlp.down_proj.lora_embedding_B\n", + "layers.10.mlp.act_fn\n", + "layers.10.input_layernorm\n", + "layers.10.post_attention_layernorm\n", + "layers.11\n", + "layers.11.self_attn\n", + "layers.11.self_attn.q_proj\n", + "layers.11.self_attn.k_proj\n", + "layers.11.self_attn.v_proj\n", + "layers.11.self_attn.o_proj\n", + 
"layers.11.self_attn.rotary_emb\n", + "layers.11.mlp\n", + "layers.11.mlp.gate_proj\n", + "layers.11.mlp.up_proj\n", + "layers.11.mlp.down_proj\n", + "layers.11.mlp.down_proj.base_layer\n", + "layers.11.mlp.down_proj.lora_dropout\n", + "layers.11.mlp.down_proj.lora_dropout.default\n", + "layers.11.mlp.down_proj.lora_A\n", + "layers.11.mlp.down_proj.lora_A.default\n", + "layers.11.mlp.down_proj.lora_B\n", + "layers.11.mlp.down_proj.lora_B.default\n", + "layers.11.mlp.down_proj.lora_embedding_A\n", + "layers.11.mlp.down_proj.lora_embedding_B\n", + "layers.11.mlp.act_fn\n", + "layers.11.input_layernorm\n", + "layers.11.post_attention_layernorm\n", + "norm\n" + ] + } + ], + "source": [ + "named_modules = [name.replace(\"base_model.model.model.\", \"\") for name, _ in model.named_modules() if \"base_model.model.model.\" in name]\n", + "for x in named_modules:\n", + " print(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual check" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "hf_embed_input= \"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.input_0\"\n", + "ff_embed_input=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + "compare_tensors(hf_embed_input, ff_embed_input)\n", + "hf_embed_output=\"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.output_0\"\n", + "ff_embed_output=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + "compare_tensors(hf_embed_output, ff_embed_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Ok!\n", "Ok!\n", "Ok!\n", @@ -91,116 +535,58 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" + "/usr/FlexFlow/tests/peft/hf_peft_tensors/layers.0.mlp.down_proj.lora_A.default.weight True\n", + "/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers_0_feed_forward_w2_lora_shard_0_weight_A False\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[20], line 40\u001b[0m\n\u001b[1;32m 38\u001b[0m hf_lora_A_weight_fp \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 39\u001b[0m ff_lora_A_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_A\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 40\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_lora_A_weight_fp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_lora_A_weight_fp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 41\u001b[0m hf_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_B.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m ff_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_B\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/usr/FlexFlow/tests/peft/alignment/align_test_utils.py:24\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(hf_tensor_filepath))\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(ff_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(ff_tensor_filepath))\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_tensor \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_tensor_filepath)\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n", + "\u001b[0;31mAssertionError\u001b[0m: " ] } ], "source": [ "tot_num_layers = 12\n", "for i in range(tot_num_layers):\n", + " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in)\n", " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", - " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_RMSNorm_shard_0_output_0\"\n", 
+ " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", " if i > 0:\n", " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_norm_shard_0_output_1\"\n", " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", - " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", " compare_tensors(hf_attn_out, ff_attn_out)\n", " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", - " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_1\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", " # w1\n", " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", - " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", " # w3\n", " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", - " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.up_proj_shard_0_output_0\"\n", " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", " # w2\n", " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", - " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", - " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", " # LORA input\n", " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", - " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0\"\n", " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", " # LORA weights\n", @@ -234,7 +620,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -292,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -375,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -2031,7 +2417,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 1e0e0bd167..cccb7cf11c 100644 --- 
a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -151,7 +151,7 @@ def peft_forward_hook(module, input, output): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--peft-model-id", type=str, default="goliaro/llama-160m-lora-full" + "--peft-model-id", type=str, default="goliaro/llama-160m-lora" ) parser.add_argument("--lora-alpha", type=int, default=16) parser.add_argument("--lora-dropout", type=float, default=0.0) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 6e6147bbb0..bf9ca816e7 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -14,17 +14,17 @@ fi # Create test prompt file mkdir -p ../inference/prompt echo '["Two things are infinite: "]' > ../inference/prompt/peft.json -echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.“"]' > ../inference/prompt/peft_dataset.json +echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.”"]' > ../inference/prompt/peft_dataset.json # Create output folder mkdir -p ../inference/output # Enable backtrace in case we run into a segfault or assertion failure -export LEGION_BACKTRACE=1 +# export LEGION_BACKTRACE=1 # Download test model -python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m +python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache # CPP test @@ -34,10 +34,30 @@ python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full -- -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ../inference/prompt/peft_dataset.json \ - -peft-model goliaro/llama-160m-lora-full \ + -peft-model goliaro/llama-160m-lora \ --use-full-precision \ --fusion \ -enable-peft -# Python test +Python test python ../inference/python/ff_peft.py + +# cd ../build +# rm -rf inference_tensors || true +# ./inference/peft/peft \ +# -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ +# -tensor-parallelism-degree 1 \ +# -ll:fsize 8192 -ll:zsize 12000 \ +# -llm-model JackFram/llama-160m \ +# -finetuning-dataset ../inference/prompt/peft_dataset.json \ +# -peft-model goliaro/llama-160m-lora \ +# -enable-peft \ +# --use-full-precision \ +# --inference-debugging +# rm -rf inference_tensors/bwd_* + +# cd ../tests/peft +# rm -rf hf_peft_tensors || true +# python hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +# rm -rf hf_peft_tensors/bwd_* + From 0c6ae097bf2d61e508c1d13e777271b292795a74 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 May 2024 20:30:14 +0000 Subject: [PATCH 182/198] update ci test --- .gitignore | 1 + tests/.gitignore | 1 - tests/peft/alignment/align_test_utils.py | 8 +- tests/peft/fine_tune.sh | 19 --- tests/peft/peft_alignment_test.py | 158 +++++++++++++++++++++++ tests/peft_test.sh | 52 ++++---- 6 files changed, 190 insertions(+), 49 deletions(-) delete mode 100644 tests/.gitignore delete mode 100755 tests/peft/fine_tune.sh create mode 100644 tests/peft/peft_alignment_test.py diff --git a/.gitignore b/.gitignore index 0642faa000..cc34c1a7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -188,6 +188,7 @@ python/flexflow/version.txt inference_tensors hf_peft_tensors +lora_training_logs Untitled-1.ipynb Untitled-2.ipynb diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index f3732d54f4..0000000000 --- a/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ 
-inference/python_test_configs/*.json diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index dbe7a0be40..24da900fcb 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -18,10 +18,10 @@ def print_unique_files_list(dirname): files_list.remove(f) return sorted(files_list) def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2): - if not (os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)): - print(hf_tensor_filepath, os.path.exists(hf_tensor_filepath)) - print(ff_tensor_filepath, os.path.exists(ff_tensor_filepath)) - assert False + if not os.path.exists(hf_tensor_filepath): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found") + if not os.path.exists(ff_tensor_filepath): + raise FileNotFoundError(f"FF tensor file {ff_tensor_filepath} not found") hf_tensor = torch.load(hf_tensor_filepath) if type(hf_tensor) == tuple or type(hf_tensor) == list: assert(len(hf_tensor) == 1) diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh deleted file mode 100755 index 309d87130a..0000000000 --- a/tests/peft/fine_tune.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /usr/bin/env bash -set -e -set -x - -# Cd into directory holding this script -cd "${BASH_SOURCE[0]%/*}" - -python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full -python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half -python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full -python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half - -python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full -python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half - -python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full -python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half -python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full -python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py new file mode 100644 index 0000000000..f07c65140b --- /dev/null +++ b/tests/peft/peft_alignment_test.py @@ -0,0 +1,158 @@ +import numpy as np +import os, torch +from alignment.align_test_utils import * + +def convert_hf_filename_to_ff_filename(f, num_layers=12): + if f.endswith(".lm_head.weight"): + f_version = f"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0" + elif f == "norm.weight": + f_version = f"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0" + else: + f_version = "fwd_step_0_" + if f.startswith("layers."): + layernum = f.split("layers.")[1].split(".")[0] + f_version += f"layers_{layernum}_" + f_version += 
f.split(".weight")[0].replace(".base_layer", "").replace(".default", "") + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + f_version += f"_shard_0_weight_{weight_index}" + return f_version + +def check_weights_alignment(): + print("-- Weights alignment --") + files_list = os.listdir(hf_path) + num_layers=12 + for f in sorted(files_list): + if f.endswith(".weight"): + if "self_attn" in f: + continue + f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers) + # print(f, f_version) + hf_w_path = os.path.join(hf_path, f) + ff_w_path = os.path.join(ff_path, f_version) + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + # print("\t", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path)) + # print("\t", ff_w_path) + + # check equivalence + compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5) + +def check_fwd_pass(tot_num_layers = 12): + print("-- FWD pass --") + # Transfomer head + hf_embed_input= f"{hf_path}/fwd_step_0_embed_tokens.input_0" + ff_embed_input = f"{ff_path}/fwd_step_0_layers_0_embed_tokens_shard_0_input_0" + compare_tensors(hf_embed_input, ff_embed_input) + hf_embed_output = f"{hf_path}/fwd_step_0_embed_tokens.output_0" + ff_embed_output = f"{ff_path}/fwd_step_0_layers_0_embed_tokens_shard_0_output_0" + compare_tensors(hf_embed_output, ff_embed_output) + + # Transformers blocks + for i in range(tot_num_layers): + hf_input_ln_in = f"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0" + ff_input_ln_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0" + if i > 0: + ff_input_ln_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0" + compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5) + hf_input_ln_out = f"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0" + ff_input_ln_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0" + if i > 0: + ff_input_ln_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1" + compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5) + hf_attn_out = f"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0" + ff_attn_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0" + compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5) + hf_ffn_norm_out = f"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0" + ff_ffn_norm_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1" + compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5) + # w1 + hf_gate_proj_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0" + ff_gate_proj_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0" + compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5) + # w3 + hf_up_proj_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0" + ff_up_proj_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0" + compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5) + # w2 + hf_down_proj_in = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0" + hf_down_proj_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0" + ff_down_proj_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0" + ff_down_proj_out = 
f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0" + compare_tensors(hf_down_proj_in, ff_down_proj_in) + # compare_tensors(hf_down_proj_out, ff_down_proj_out) + # LORA input + hf_lora_A_in = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0" + ff_lora_A_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0" + compare_hf_tensors(hf_down_proj_in, hf_lora_A_in) + compare_tensors(hf_lora_A_in, ff_lora_A_in) + # LORA weights + hf_lora_A_weight_fp = f"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight" + ff_lora_A_weight_fp = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A" + compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp) + hf_lora_B_weight_fp = f"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight" + ff_lora_B_weight_fp = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B" + compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp) + # LORA intermediate hf + hf_lora_A_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0" + hf_lora_B_in = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0" + compare_hf_tensors(hf_lora_A_out, hf_lora_B_in) + # LORA output + hf_lora_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0" + ff_lora_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0" + # compare_tensors(hf_lora_out, ff_lora_out) + # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out) + # compare_tensors(hf_down_proj_out, ff_lora_out) + compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out) + + + # After last layer only + hf_norm_out = f"{hf_path}/fwd_step_0_norm.output_0" + ff_norm_out = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1" + compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5) + hf_lm_head_out = f"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0" + ff_lm_head_out = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0" + compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5) + +def check_bwd_pass(tot_num_layers = 12): + # ff_BWD_softmax_in = f"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0" + print("-- LM head --") + hf_BWD_lm_head_out = f"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0" + ff_BWD_lm_head_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0" + compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5) + # compare weights + hf_lm_head_weight = f"{hf_path}/base_model.model.lm_head.weight" + ff_lm_head_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0" + compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5) + hf_BWD_lm_head_in = f"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0" + ff_BWD_lm_head_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0" + compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5) + # # Manually check the matmul + # ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',') + # ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F') + # ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F') + # print(ff_tensor_out.shape) + # print(ff_weight.shape) + # print(np.matmul(ff_weight, ff_tensor_out)) + # compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in) + # ff_tensor = 
np.loadtxt(ff_tensor_filepath, delimiter=',') + print("-- Final Norm --") + hf_BWD_norm_out = f"{hf_path}/bwd_step_0_norm.go_0" + ff_BWD_norm_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0" + compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out) + compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out) + ff_BWD_norm_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0" + hf_FWD_norm_weight = f"{hf_path}/norm.weight" + compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5) + hf_BWD_norm_in = f"{hf_path}/bwd_step_0_norm.gi_0" + ff_BWD_norm_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1" + compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5) + +if __name__ == "__main__": + check_weights_alignment() + check_fwd_pass() + check_bwd_pass() diff --git a/tests/peft_test.sh b/tests/peft_test.sh index bf9ca816e7..219b82342a 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -27,37 +27,39 @@ mkdir -p ../inference/output python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache -# CPP test -../build/inference/peft/peft \ - -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 4 \ - -ll:fsize 8192 -ll:zsize 12000 \ - -llm-model JackFram/llama-160m \ - -finetuning-dataset ../inference/prompt/peft_dataset.json \ - -peft-model goliaro/llama-160m-lora \ - --use-full-precision \ - --fusion \ - -enable-peft - -Python test -python ../inference/python/ff_peft.py - -# cd ../build -# rm -rf inference_tensors || true -# ./inference/peft/peft \ -# -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ -# -tensor-parallelism-degree 1 \ +# # CPP test +# ../build/inference/peft/peft \ +# -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ +# -tensor-parallelism-degree 4 \ # -ll:fsize 8192 -ll:zsize 12000 \ # -llm-model JackFram/llama-160m \ # -finetuning-dataset ../inference/prompt/peft_dataset.json \ # -peft-model goliaro/llama-160m-lora \ -# -enable-peft \ # --use-full-precision \ -# --inference-debugging +# --fusion \ +# -enable-peft + +# # Python test +# python ../inference/python/ff_peft.py + +cd ../build +rm -rf inference_tensors || true +./inference/peft/peft \ + -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 1 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging # rm -rf inference_tensors/bwd_* -# cd ../tests/peft -# rm -rf hf_peft_tensors || true -# python hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +cd ../tests/peft +rm -rf hf_peft_tensors || true +python hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision # rm -rf hf_peft_tensors/bwd_* + +python peft_alignment_test.py From 93b6032b29f92e3be42aafea3f822722f6fbbeb4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 May 2024 20:30:51 +0000 Subject: [PATCH 183/198] update alignment doc --- .../alignment/llama_alignment_tests.ipynb | 892 +++++++++++------- 1 file changed, 559 insertions(+), 333 deletions(-) diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb index 868dad18e3..86a4ef76c4 100644 --- a/tests/peft/alignment/llama_alignment_tests.ipynb +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -37,7 +37,7 @@ }, { 
"cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -110,29 +110,33 @@ } ], "source": [ + "def convert_hf_filename_to_ff_filename(f, num_layers=12):\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " return f_version\n", + "\n", "files_list = os.listdir(hf_path)\n", "num_layers=12\n", "for f in sorted(files_list):\n", " if f.endswith(\".weight\"):\n", " if \"self_attn\" in f:\n", " continue\n", - " if f.endswith(\".lm_head.weight\"):\n", - " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", - " elif f == \"norm.weight\":\n", - " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", - " else:\n", - " f_version = \"fwd_step_0_\"\n", - " if f.startswith(\"layers.\"):\n", - " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", - " f_version += f\"layers_{layernum}_\"\n", - " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", - " weight_index=\"0\"\n", - " if \"lora_A\" in f_version:\n", - " weight_index=\"A\"\n", - " elif \"lora_B\" in f_version:\n", - " weight_index=\"B\"\n", - " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", - " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers)\n", " # print(f, f_version)\n", " hf_w_path = os.path.join(hf_path, f)\n", " ff_w_path = os.path.join(ff_path, f_version)\n", @@ -154,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -192,299 +196,369 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens True True\n", + "layers.0.self_attn.q_proj True True\n", + "layers.0.self_attn.k_proj True True\n", + "layers.0.self_attn.v_proj True True\n", + "layers.0.self_attn.o_proj True True\n", + "layers.0.self_attn.rotary_emb True True\n", + "layers.0.mlp.gate_proj True True\n", + "layers.0.mlp.up_proj True True\n", + "layers.0.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + 
"layers.0.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.0.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_output_0\n", + "layers.0.input_layernorm True True\n", + "layers.0.post_attention_layernorm True True\n", + "layers.1.self_attn.q_proj True True\n", + "layers.1.self_attn.k_proj True True\n", + "layers.1.self_attn.v_proj True True\n", + "layers.1.self_attn.o_proj True True\n", + "layers.1.self_attn.rotary_emb True True\n", + "layers.1.mlp.gate_proj True True\n", + "layers.1.mlp.up_proj True True\n", + "layers.1.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.1.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_output_0\n", + "layers.1.input_layernorm True True\n", + "layers.1.post_attention_layernorm True True\n", + "layers.2.self_attn.q_proj True True\n", + "layers.2.self_attn.k_proj True True\n", + "layers.2.self_attn.v_proj True True\n", + "layers.2.self_attn.o_proj True True\n", + "layers.2.self_attn.rotary_emb True True\n", + "layers.2.mlp.gate_proj True True\n", + "layers.2.mlp.up_proj True True\n", + "layers.2.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.2.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_output_0\n", + "layers.2.input_layernorm True True\n", + "layers.2.post_attention_layernorm True True\n", + "layers.3.self_attn.q_proj True True\n", + "layers.3.self_attn.k_proj True True\n", + "layers.3.self_attn.v_proj True True\n", + "layers.3.self_attn.o_proj True True\n", + "layers.3.self_attn.rotary_emb True True\n", + "layers.3.mlp.gate_proj True True\n", + "layers.3.mlp.up_proj True True\n", + "layers.3.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_dropout.default True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.3.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_output_0\n", + "layers.3.input_layernorm True True\n", + "layers.3.post_attention_layernorm True True\n", + "layers.4.self_attn.q_proj True True\n", + "layers.4.self_attn.k_proj True True\n", + "layers.4.self_attn.v_proj True True\n", + "layers.4.self_attn.o_proj True True\n", + "layers.4.self_attn.rotary_emb True True\n", + "layers.4.mlp.gate_proj True True\n", + "layers.4.mlp.up_proj True True\n", + "layers.4.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + 
"layers.4.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.4.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_output_0\n", + "layers.4.input_layernorm True True\n", + "layers.4.post_attention_layernorm True True\n", + "layers.5.self_attn.q_proj True True\n", + "layers.5.self_attn.k_proj True True\n", + "layers.5.self_attn.v_proj True True\n", + "layers.5.self_attn.o_proj True True\n", + "layers.5.self_attn.rotary_emb True True\n", + "layers.5.mlp.gate_proj True True\n", + "layers.5.mlp.up_proj True True\n", + "layers.5.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.5.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_output_0\n", + "layers.5.input_layernorm True True\n", + "layers.5.post_attention_layernorm True True\n", + "layers.6.self_attn.q_proj True True\n", + "layers.6.self_attn.k_proj True True\n", + "layers.6.self_attn.v_proj True True\n", + "layers.6.self_attn.o_proj True True\n", + "layers.6.self_attn.rotary_emb True True\n", + "layers.6.mlp.gate_proj True True\n", + "layers.6.mlp.up_proj True True\n", + "layers.6.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.6.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_output_0\n", + "layers.6.input_layernorm True True\n", + "layers.6.post_attention_layernorm True True\n", + "layers.7.self_attn.q_proj True True\n", + "layers.7.self_attn.k_proj True True\n", + "layers.7.self_attn.v_proj True True\n", + "layers.7.self_attn.o_proj True True\n", + "layers.7.self_attn.rotary_emb True True\n", + "layers.7.mlp.gate_proj True True\n", + "layers.7.mlp.up_proj True True\n", + "layers.7.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_A False False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.7.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_output_0\n", + "layers.7.input_layernorm True True\n", + "layers.7.post_attention_layernorm True True\n", + "layers.8.self_attn.q_proj True True\n", + "layers.8.self_attn.k_proj True True\n", + "layers.8.self_attn.v_proj True True\n", + "layers.8.self_attn.o_proj True True\n", + "layers.8.self_attn.rotary_emb True True\n", + "layers.8.mlp.gate_proj True True\n", + "layers.8.mlp.up_proj True True\n", + "layers.8.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.8.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_output_0\n", + "layers.8.input_layernorm True True\n", + "layers.8.post_attention_layernorm True True\n", + "layers.9.self_attn.q_proj True True\n", + "layers.9.self_attn.k_proj True True\n", + "layers.9.self_attn.v_proj True True\n", + "layers.9.self_attn.o_proj True True\n", + "layers.9.self_attn.rotary_emb True True\n", + "layers.9.mlp.gate_proj True True\n", + "layers.9.mlp.up_proj True True\n", + 
"layers.9.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.9.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_output_0\n", + "layers.9.input_layernorm True True\n", + "layers.9.post_attention_layernorm True True\n", + "layers.10.self_attn.q_proj True True\n", + "layers.10.self_attn.k_proj True True\n", + "layers.10.self_attn.v_proj True True\n", + "layers.10.self_attn.o_proj True True\n", + "layers.10.self_attn.rotary_emb True True\n", + "layers.10.mlp.gate_proj True True\n", + "layers.10.mlp.up_proj True True\n", + "layers.10.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.10.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_output_0\n", + "layers.10.input_layernorm True True\n", + "layers.10.post_attention_layernorm True True\n", + "layers.11.self_attn.q_proj True True\n", + "layers.11.self_attn.k_proj True True\n", + "layers.11.self_attn.v_proj True True\n", + "layers.11.self_attn.o_proj True True\n", + "layers.11.self_attn.rotary_emb True True\n", + "layers.11.mlp.gate_proj True True\n", + "layers.11.mlp.up_proj True True\n", + "layers.11.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.11.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_output_0\n", + "layers.11.input_layernorm True True\n", + "layers.11.post_attention_layernorm True True\n", + "norm True True\n", + "lm_head True True\n" + ] + } + 
], + "source": [ + "named_modules_ = [\n", + " name.replace(\"base_model.model.model.\", \"\").replace(\"base_model.model.model\", \"\").replace(\"base_model.model.\", \"\").replace(\"base_model.model\", \"\").replace(\"base_model.\", \"\").replace(\"base_model\", \"\")\n", + " for name, _ in model.named_modules()\n", + "]\n", + "\n", + "def remove_prefixes(named_modules):\n", + " i = 0\n", + " while i < len(named_modules) - 1:\n", + " if named_modules[i + 1].startswith(named_modules[i]):\n", + " named_modules.pop(i)\n", + " else:\n", + " i += 1\n", + " return named_modules\n", + "named_modules = remove_prefixes(named_modules_)\n", + "\n", + "def convert_hf_module_name_to_ff_filenames(n, num_layers=12):\n", + " if n == \"embed_tokens\":\n", + " ff_in_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + " ff_out_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + " elif n == \"lm_head\" or n == \"norm\":\n", + " ff_in_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_output_0\"\n", + " elif n.startswith(\"layers.\"):\n", + " layernum = n.split(\"layers.\")[1].split(\".\")[0]\n", + " ff_in_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_output_0\"\n", + " else:\n", + " assert False, f\"Module {n} not supported yet\"\n", + " return os.path.join(ff_path, ff_in_name), os.path.join(ff_path, ff_out_name)\n", + "\n", + "# Compute the hf path, check if the input and output are there\n", + "for n in named_modules:\n", + " in_name = f\"fwd_step_0_{n}.input_0\"\n", + " out_name = f\"fwd_step_0_{n}.output_0\"\n", + " if n == \"lm_head\":\n", + " in_name = f\"fwd_step_0_base_model.model.{n}.input_0\"\n", + " out_name = f\"fwd_step_0_base_model.model.{n}.output_0\"\n", + " hf_mod_in = os.path.join(hf_path, in_name)\n", + " hf_mod_out = os.path.join(hf_path, out_name)\n", + " check = os.path.exists(hf_mod_in) and os.path.exists(hf_mod_out)\n", + " \n", + " check2=True\n", + " if \"self_attn\" not in n:\n", + " ff_mod_in, ff_mod_out = convert_hf_module_name_to_ff_filenames(n, num_layers=num_layers)\n", + " check2 = os.path.exists(ff_mod_in) and os.path.exists(ff_mod_out)\n", + " print(n, check, check2)\n", + " if not check2:\n", + " print(\"\\t\", ff_mod_in, ff_mod_out)\n", + " # print(n, check)\n", + " # print(\"\\t\", )\n", + " \n", + "\n", + "# Compute the corresponding ff path, check if the input and output are there\n", + "\n", + "# for x in named_modules:\n", + "# print(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "embed_tokens\n", - "layers\n", - "layers.0\n", - "layers.0.self_attn\n", - "layers.0.self_attn.q_proj\n", - "layers.0.self_attn.k_proj\n", - "layers.0.self_attn.v_proj\n", - "layers.0.self_attn.o_proj\n", - "layers.0.self_attn.rotary_emb\n", - "layers.0.mlp\n", - "layers.0.mlp.gate_proj\n", - "layers.0.mlp.up_proj\n", - "layers.0.mlp.down_proj\n", - "layers.0.mlp.down_proj.base_layer\n", - "layers.0.mlp.down_proj.lora_dropout\n", - "layers.0.mlp.down_proj.lora_dropout.default\n", - "layers.0.mlp.down_proj.lora_A\n", - "layers.0.mlp.down_proj.lora_A.default\n", - "layers.0.mlp.down_proj.lora_B\n", - "layers.0.mlp.down_proj.lora_B.default\n", - "layers.0.mlp.down_proj.lora_embedding_A\n", - "layers.0.mlp.down_proj.lora_embedding_B\n", - "layers.0.mlp.act_fn\n", - "layers.0.input_layernorm\n", - 
"layers.0.post_attention_layernorm\n", - "layers.1\n", - "layers.1.self_attn\n", - "layers.1.self_attn.q_proj\n", - "layers.1.self_attn.k_proj\n", - "layers.1.self_attn.v_proj\n", - "layers.1.self_attn.o_proj\n", - "layers.1.self_attn.rotary_emb\n", - "layers.1.mlp\n", - "layers.1.mlp.gate_proj\n", - "layers.1.mlp.up_proj\n", - "layers.1.mlp.down_proj\n", - "layers.1.mlp.down_proj.base_layer\n", - "layers.1.mlp.down_proj.lora_dropout\n", - "layers.1.mlp.down_proj.lora_dropout.default\n", - "layers.1.mlp.down_proj.lora_A\n", - "layers.1.mlp.down_proj.lora_A.default\n", - "layers.1.mlp.down_proj.lora_B\n", - "layers.1.mlp.down_proj.lora_B.default\n", - "layers.1.mlp.down_proj.lora_embedding_A\n", - "layers.1.mlp.down_proj.lora_embedding_B\n", - "layers.1.mlp.act_fn\n", - "layers.1.input_layernorm\n", - "layers.1.post_attention_layernorm\n", - "layers.2\n", - "layers.2.self_attn\n", - "layers.2.self_attn.q_proj\n", - "layers.2.self_attn.k_proj\n", - "layers.2.self_attn.v_proj\n", - "layers.2.self_attn.o_proj\n", - "layers.2.self_attn.rotary_emb\n", - "layers.2.mlp\n", - "layers.2.mlp.gate_proj\n", - "layers.2.mlp.up_proj\n", - "layers.2.mlp.down_proj\n", - "layers.2.mlp.down_proj.base_layer\n", - "layers.2.mlp.down_proj.lora_dropout\n", - "layers.2.mlp.down_proj.lora_dropout.default\n", - "layers.2.mlp.down_proj.lora_A\n", - "layers.2.mlp.down_proj.lora_A.default\n", - "layers.2.mlp.down_proj.lora_B\n", - "layers.2.mlp.down_proj.lora_B.default\n", - "layers.2.mlp.down_proj.lora_embedding_A\n", - "layers.2.mlp.down_proj.lora_embedding_B\n", - "layers.2.mlp.act_fn\n", - "layers.2.input_layernorm\n", - "layers.2.post_attention_layernorm\n", - "layers.3\n", - "layers.3.self_attn\n", - "layers.3.self_attn.q_proj\n", - "layers.3.self_attn.k_proj\n", - "layers.3.self_attn.v_proj\n", - "layers.3.self_attn.o_proj\n", - "layers.3.self_attn.rotary_emb\n", - "layers.3.mlp\n", - "layers.3.mlp.gate_proj\n", - "layers.3.mlp.up_proj\n", - "layers.3.mlp.down_proj\n", - "layers.3.mlp.down_proj.base_layer\n", - "layers.3.mlp.down_proj.lora_dropout\n", - "layers.3.mlp.down_proj.lora_dropout.default\n", - "layers.3.mlp.down_proj.lora_A\n", - "layers.3.mlp.down_proj.lora_A.default\n", - "layers.3.mlp.down_proj.lora_B\n", - "layers.3.mlp.down_proj.lora_B.default\n", - "layers.3.mlp.down_proj.lora_embedding_A\n", - "layers.3.mlp.down_proj.lora_embedding_B\n", - "layers.3.mlp.act_fn\n", - "layers.3.input_layernorm\n", - "layers.3.post_attention_layernorm\n", - "layers.4\n", - "layers.4.self_attn\n", - "layers.4.self_attn.q_proj\n", - "layers.4.self_attn.k_proj\n", - "layers.4.self_attn.v_proj\n", - "layers.4.self_attn.o_proj\n", - "layers.4.self_attn.rotary_emb\n", - "layers.4.mlp\n", - "layers.4.mlp.gate_proj\n", - "layers.4.mlp.up_proj\n", - "layers.4.mlp.down_proj\n", - "layers.4.mlp.down_proj.base_layer\n", - "layers.4.mlp.down_proj.lora_dropout\n", - "layers.4.mlp.down_proj.lora_dropout.default\n", - "layers.4.mlp.down_proj.lora_A\n", - "layers.4.mlp.down_proj.lora_A.default\n", - "layers.4.mlp.down_proj.lora_B\n", - "layers.4.mlp.down_proj.lora_B.default\n", - "layers.4.mlp.down_proj.lora_embedding_A\n", - "layers.4.mlp.down_proj.lora_embedding_B\n", - "layers.4.mlp.act_fn\n", - "layers.4.input_layernorm\n", - "layers.4.post_attention_layernorm\n", - "layers.5\n", - "layers.5.self_attn\n", - "layers.5.self_attn.q_proj\n", - "layers.5.self_attn.k_proj\n", - "layers.5.self_attn.v_proj\n", - "layers.5.self_attn.o_proj\n", - "layers.5.self_attn.rotary_emb\n", - "layers.5.mlp\n", - "layers.5.mlp.gate_proj\n", - 
"layers.5.mlp.up_proj\n", - "layers.5.mlp.down_proj\n", - "layers.5.mlp.down_proj.base_layer\n", - "layers.5.mlp.down_proj.lora_dropout\n", - "layers.5.mlp.down_proj.lora_dropout.default\n", - "layers.5.mlp.down_proj.lora_A\n", - "layers.5.mlp.down_proj.lora_A.default\n", - "layers.5.mlp.down_proj.lora_B\n", - "layers.5.mlp.down_proj.lora_B.default\n", - "layers.5.mlp.down_proj.lora_embedding_A\n", - "layers.5.mlp.down_proj.lora_embedding_B\n", - "layers.5.mlp.act_fn\n", - "layers.5.input_layernorm\n", - "layers.5.post_attention_layernorm\n", - "layers.6\n", - "layers.6.self_attn\n", - "layers.6.self_attn.q_proj\n", - "layers.6.self_attn.k_proj\n", - "layers.6.self_attn.v_proj\n", - "layers.6.self_attn.o_proj\n", - "layers.6.self_attn.rotary_emb\n", - "layers.6.mlp\n", - "layers.6.mlp.gate_proj\n", - "layers.6.mlp.up_proj\n", - "layers.6.mlp.down_proj\n", - "layers.6.mlp.down_proj.base_layer\n", - "layers.6.mlp.down_proj.lora_dropout\n", - "layers.6.mlp.down_proj.lora_dropout.default\n", - "layers.6.mlp.down_proj.lora_A\n", - "layers.6.mlp.down_proj.lora_A.default\n", - "layers.6.mlp.down_proj.lora_B\n", - "layers.6.mlp.down_proj.lora_B.default\n", - "layers.6.mlp.down_proj.lora_embedding_A\n", - "layers.6.mlp.down_proj.lora_embedding_B\n", - "layers.6.mlp.act_fn\n", - "layers.6.input_layernorm\n", - "layers.6.post_attention_layernorm\n", - "layers.7\n", - "layers.7.self_attn\n", - "layers.7.self_attn.q_proj\n", - "layers.7.self_attn.k_proj\n", - "layers.7.self_attn.v_proj\n", - "layers.7.self_attn.o_proj\n", - "layers.7.self_attn.rotary_emb\n", - "layers.7.mlp\n", - "layers.7.mlp.gate_proj\n", - "layers.7.mlp.up_proj\n", - "layers.7.mlp.down_proj\n", - "layers.7.mlp.down_proj.base_layer\n", - "layers.7.mlp.down_proj.lora_dropout\n", - "layers.7.mlp.down_proj.lora_dropout.default\n", - "layers.7.mlp.down_proj.lora_A\n", - "layers.7.mlp.down_proj.lora_A.default\n", - "layers.7.mlp.down_proj.lora_B\n", - "layers.7.mlp.down_proj.lora_B.default\n", - "layers.7.mlp.down_proj.lora_embedding_A\n", - "layers.7.mlp.down_proj.lora_embedding_B\n", - "layers.7.mlp.act_fn\n", - "layers.7.input_layernorm\n", - "layers.7.post_attention_layernorm\n", - "layers.8\n", - "layers.8.self_attn\n", - "layers.8.self_attn.q_proj\n", - "layers.8.self_attn.k_proj\n", - "layers.8.self_attn.v_proj\n", - "layers.8.self_attn.o_proj\n", - "layers.8.self_attn.rotary_emb\n", - "layers.8.mlp\n", - "layers.8.mlp.gate_proj\n", - "layers.8.mlp.up_proj\n", - "layers.8.mlp.down_proj\n", - "layers.8.mlp.down_proj.base_layer\n", - "layers.8.mlp.down_proj.lora_dropout\n", - "layers.8.mlp.down_proj.lora_dropout.default\n", - "layers.8.mlp.down_proj.lora_A\n", - "layers.8.mlp.down_proj.lora_A.default\n", - "layers.8.mlp.down_proj.lora_B\n", - "layers.8.mlp.down_proj.lora_B.default\n", - "layers.8.mlp.down_proj.lora_embedding_A\n", - "layers.8.mlp.down_proj.lora_embedding_B\n", - "layers.8.mlp.act_fn\n", - "layers.8.input_layernorm\n", - "layers.8.post_attention_layernorm\n", - "layers.9\n", - "layers.9.self_attn\n", - "layers.9.self_attn.q_proj\n", - "layers.9.self_attn.k_proj\n", - "layers.9.self_attn.v_proj\n", - "layers.9.self_attn.o_proj\n", - "layers.9.self_attn.rotary_emb\n", - "layers.9.mlp\n", - "layers.9.mlp.gate_proj\n", - "layers.9.mlp.up_proj\n", - "layers.9.mlp.down_proj\n", - "layers.9.mlp.down_proj.base_layer\n", - "layers.9.mlp.down_proj.lora_dropout\n", - "layers.9.mlp.down_proj.lora_dropout.default\n", - "layers.9.mlp.down_proj.lora_A\n", - "layers.9.mlp.down_proj.lora_A.default\n", - 
"layers.9.mlp.down_proj.lora_B\n", - "layers.9.mlp.down_proj.lora_B.default\n", - "layers.9.mlp.down_proj.lora_embedding_A\n", - "layers.9.mlp.down_proj.lora_embedding_B\n", - "layers.9.mlp.act_fn\n", - "layers.9.input_layernorm\n", - "layers.9.post_attention_layernorm\n", - "layers.10\n", - "layers.10.self_attn\n", - "layers.10.self_attn.q_proj\n", - "layers.10.self_attn.k_proj\n", - "layers.10.self_attn.v_proj\n", - "layers.10.self_attn.o_proj\n", - "layers.10.self_attn.rotary_emb\n", - "layers.10.mlp\n", - "layers.10.mlp.gate_proj\n", - "layers.10.mlp.up_proj\n", - "layers.10.mlp.down_proj\n", - "layers.10.mlp.down_proj.base_layer\n", - "layers.10.mlp.down_proj.lora_dropout\n", - "layers.10.mlp.down_proj.lora_dropout.default\n", - "layers.10.mlp.down_proj.lora_A\n", - "layers.10.mlp.down_proj.lora_A.default\n", - "layers.10.mlp.down_proj.lora_B\n", - "layers.10.mlp.down_proj.lora_B.default\n", - "layers.10.mlp.down_proj.lora_embedding_A\n", - "layers.10.mlp.down_proj.lora_embedding_B\n", - "layers.10.mlp.act_fn\n", - "layers.10.input_layernorm\n", - "layers.10.post_attention_layernorm\n", - "layers.11\n", - "layers.11.self_attn\n", - "layers.11.self_attn.q_proj\n", - "layers.11.self_attn.k_proj\n", - "layers.11.self_attn.v_proj\n", - "layers.11.self_attn.o_proj\n", - "layers.11.self_attn.rotary_emb\n", - "layers.11.mlp\n", - "layers.11.mlp.gate_proj\n", - "layers.11.mlp.up_proj\n", - "layers.11.mlp.down_proj\n", - "layers.11.mlp.down_proj.base_layer\n", - "layers.11.mlp.down_proj.lora_dropout\n", - "layers.11.mlp.down_proj.lora_dropout.default\n", - "layers.11.mlp.down_proj.lora_A\n", - "layers.11.mlp.down_proj.lora_A.default\n", - "layers.11.mlp.down_proj.lora_B\n", - "layers.11.mlp.down_proj.lora_B.default\n", - "layers.11.mlp.down_proj.lora_embedding_A\n", - "layers.11.mlp.down_proj.lora_embedding_B\n", - "layers.11.mlp.act_fn\n", - "layers.11.input_layernorm\n", - "layers.11.post_attention_layernorm\n", - "norm\n" + "{'down_proj'}\n" ] } ], "source": [ - "named_modules = [name.replace(\"base_model.model.model.\", \"\") for name, _ in model.named_modules() if \"base_model.model.model.\" in name]\n", - "for x in named_modules:\n", - " print(x)" + "print(model.peft_config['default'].target_modules)" ] }, { @@ -496,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -519,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -535,20 +609,170 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "/usr/FlexFlow/tests/peft/hf_peft_tensors/layers.0.mlp.down_proj.lora_A.default.weight True\n", - "/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers_0_feed_forward_w2_lora_shard_0_weight_A False\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[20], line 40\u001b[0m\n\u001b[1;32m 38\u001b[0m hf_lora_A_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 39\u001b[0m ff_lora_A_weight_fp 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_A\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 40\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_lora_A_weight_fp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_lora_A_weight_fp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 41\u001b[0m hf_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_B.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m ff_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_B\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m/usr/FlexFlow/tests/peft/alignment/align_test_utils.py:24\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(hf_tensor_filepath))\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(ff_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(ff_tensor_filepath))\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_tensor \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_tensor_filepath)\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n", - "\u001b[0;31mAssertionError\u001b[0m: " + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", 
+ "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.10.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.06630182 6.3429456\n", + " -0.21220279]\n", + "FF:[ 0. 0. 0. ... 0.06630275 6.34293985\n", + " -0.21219885]\n", + "[ True True True ... True True True]\n", + "[15889]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.14172177 9.79423\n", + " -6.2940273 ]\n", + "FF:[ 0. 0. 0. ... 0.14172006 9.79421902\n", + " -6.29402065]\n", + "[ True True True ... True True True]\n", + "[ 2878 3206 3367 3607 5183 5346 6257 6544 7466 7679 7805 8119\n", + " 8159 8911 9450 9897 13696 13938 14058 14599 15126 15839 16128 16195]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" ] } ], @@ -557,31 +781,33 @@ "for i in range(tot_num_layers):\n", " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", - " compare_tensors(hf_input_ln_in, ff_input_ln_in)\n", + " if i > 0:\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5)\n", " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", " if i > 0:\n", - " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_norm_shard_0_output_1\"\n", - " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5)\n", " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", - " compare_tensors(hf_attn_out, ff_attn_out)\n", + " compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5)\n", " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", - " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5)\n", " # w1\n", " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", " ff_gate_proj_out = 
f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", - " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5)\n", " # w3\n", " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", - " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.up_proj_shard_0_output_0\"\n", - " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5)\n", " # w2\n", " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", - " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0\"\n", " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", " # LORA input\n", @@ -591,10 +817,10 @@ " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", " # LORA weights\n", " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", - " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A\"\n", " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", - " ff_lora_B_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B\"\n", " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", " # LORA intermediate hf\n", " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", @@ -602,7 +828,7 @@ " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", " # LORA output\n", " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", - " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0\"\n", " # compare_tensors(hf_lora_out, ff_lora_out)\n", " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", @@ -612,10 +838,10 @@ "# After last layer only\n", "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", - "compare_tensors(hf_norm_out, ff_norm_out)\n", + "compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5)\n", "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", - "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", - "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5)" ] }, { 
From 95462392bbc58ee1110c4aa2c8f3b01526e5683a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 19 May 2024 07:14:26 +0000 Subject: [PATCH 184/198] fix cross entropy loss bug --- src/ops/kernels/softmax.cu | 13 ++++++++----- tests/peft/hf_finetune.py | 18 ------------------ tests/peft/peft_alignment_test.py | 10 +++++----- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index c8bc242af0..16f1219bf6 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -295,9 +295,11 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward( int num_tokens, int num_classes) { CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; input_grad[i] = output_grad[i]; - if (i % num_classes == token_ids[i / num_classes]) { - input_grad[i] -= 1.0f; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; } } } @@ -320,9 +322,10 @@ void peft_bwd_kernel(SoftmaxMeta const *m, tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } - int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->labelsInfo[j + tokens_previous_requests].token_id; + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; } DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); @@ -359,7 +362,7 @@ void peft_bwd_kernel(SoftmaxMeta const *m, DT(0.0), scale_factor); - tokens_previous_requests += num_bwd_tokens; + tokens_previous_requests += num_bwd_tokens + 1; } assert(tokens_previous_requests == bc->num_active_tokens()); } diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index cccb7cf11c..aef2bdb524 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -44,21 +44,6 @@ def print_trainable_parameters(model): ) -def lm_head_pre_backward_hook(module, grad_output): - # Fill grad input tensor with 0.5 to align other layers without having to align loss - assert len(grad_output) == 1 - assert "lm_head" in module.name - name = module.name.replace("base_model.model.model.", "") - print( - f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" - ) - print(grad_output[0].shape) - dev = grad_output[0].device - new_grad_output = torch.full(grad_output[0].shape, 0.5).to(dev) - assert new_grad_output.shape == grad_output[0].shape - return (new_grad_output,) - - def peft_backward_hook(module, grad_input, grad_output): assert(type(grad_input) == tuple and type(grad_output) == tuple) if len(grad_input) == 0 or len(grad_output) == 0: @@ -247,9 +232,6 @@ def main(): print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) layer.register_full_backward_hook(peft_backward_hook) - # TODO: remove hard-coding of lm head grad input after aligning the loss - if "lm_head" in name: - layer.register_full_backward_pre_hook(lm_head_pre_backward_hook) # Save any weights of interest for name, params in model.named_parameters(): simplified_name = name.replace("base_model.model.model.", "") diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index f07c65140b..c93fc0e0b0 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -122,14 +122,14 @@ def 
check_bwd_pass(tot_num_layers = 12): # ff_BWD_softmax_in = f"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0" print("-- LM head --") hf_BWD_lm_head_out = f"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0" - ff_BWD_lm_head_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0" + ff_BWD_lm_head_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0" compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5) # compare weights - hf_lm_head_weight = f"{hf_path}/base_model.model.lm_head.weight" - ff_lm_head_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0" - compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5) + # hf_lm_head_weight = f"{hf_path}/base_model.model.lm_head.weight" + # ff_lm_head_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0" + # compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5) hf_BWD_lm_head_in = f"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0" - ff_BWD_lm_head_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0" + ff_BWD_lm_head_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_input_0" compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5) # # Manually check the matmul # ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',') From ff4b703f5b37bd3932227b5e4df35d051d9810ff Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 19 May 2024 20:03:47 +0000 Subject: [PATCH 185/198] update alignment test --- tests/peft/peft_alignment_test.py | 114 ++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index c93fc0e0b0..2d4a7cb353 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -152,6 +152,120 @@ def check_bwd_pass(tot_num_layers = 12): ff_BWD_norm_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1" compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5) + print("-- Transformers blocks --") + for i in range(tot_num_layers-1, -1, -1): + # HuggingFace filepaths + hf_BWD_norm_in = f"{hf_path}/bwd_step_0_norm.gi_0" + hf_BWD_loraB_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0" + hf_BWD_loraB_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0" + hf_BWD_loraA_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0" + hf_BWD_loraA_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0" + hf_loraA_weight = f"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight" + hf_loraB_weight = f"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight" + hf_BWD_w2_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0" + hf_BWD_w2_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0" + hf_w2_weight = f"{hf_path}/layers.{i}.mlp.down_proj.base_layer.weight" + hf_BWD_w3_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0" + hf_BWD_w3_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0" + hf_BWD_w1_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0" + hf_BWD_w1_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0" + hf_BWD_act_fn_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0" + hf_BWD_ffn_norm_out = f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0" + hf_BWD_ffn_norm_in = 
f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0" + hf_BWD_attn_out_out = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0" + + # FlexFlow filepaths + ff_BWD_w2_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0" + ff_BWD_w2_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0" + ff_BWD_w2_in_pre = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_pre_input_0" + ff_w2_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_weight_0" + ff_BWD_ssm_out = f"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0" + ff_BWD_ssm_in1 = f"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0" + ff_BWD_ssm_in2 = f"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1" + ff_BWD_w3_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0" + ff_BWD_w3_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_input_0" + ff_BWD_lora_A_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0" + ff_BWD_lora_B_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0" + ff_lora_A_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A" + ff_lora_B_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B" + ff_BWD_w1_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0" + ff_BWD_w1_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_input_0" + ff_BWD_w1_in_pre = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_pre_input_0" + ff_BWD_ffn_norm_in1 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_input_0" + ff_BWD_ffn_norm_in2 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_input_1" + ff_BWD_ffn_norm_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_0" + ff_BWD_attn_out = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0" + + # HuggingFace checks + print("\nHuggingface checks:") + if i == tot_num_layers-1: + compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out) + compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out) + compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out) + compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out) + + compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out) + check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in) + if i == tot_num_layers-1: + check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in) + + # FlexFlow checks + print("\nFlexFlow checks:") + compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out) + compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in) + compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out) + compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out) + compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out) + # compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out) + # compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in) + # compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) + + # HF-FlexFlow checks + print("\nHuggingface-FlexFlow checks:") + print("-- W2 --") + compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5) + compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5) + + print("-- Lora --") + compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5) + 
compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5) + + compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out) + compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in) + + print("-- W2/W1/W3 --") + compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out) + compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in) + compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out) + compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre) + compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out) + compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in) + compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out) + + print("-- Attention --") + compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out) + num_tokens = 24 + + hf_attn_in = f"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0" + hf_attn_in = torch.load(hf_attn_in) + hf_attn_in = hf_attn_in.squeeze().T + hf_attn_in = hf_attn_in.detach().cpu().numpy() + print("hf_attn_in: ", hf_attn_in.shape) + print(hf_attn_in) + + ff_attn_in = f"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in" + ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F') + print("ff_attn_in: ", ff_attn_in.shape) + print(ff_attn_in) + #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2)) + + mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in)) + mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))] + pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1]) + print(f"{pct_mismatch*100}% mismatch in attention input grads") + assert(pct_mismatch <= 0.1) + + if __name__ == "__main__": check_weights_alignment() check_fwd_pass() From b613666dc8e5603fb534fd8bf7897b7f010fa2b5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 May 2024 01:21:36 +0000 Subject: [PATCH 186/198] update test --- tests/peft/alignment/align_test_utils.py | 155 ++++++++++++++++++++--- tests/peft/peft_alignment_test.py | 97 +++++++++++--- 2 files changed, 218 insertions(+), 34 deletions(-) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index 24da900fcb..4d202a3cc5 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -1,5 +1,6 @@ import os, re, torch import numpy as np +from typing import List abs_dirname = os.path.dirname(os.path.abspath(__file__)) hf_path = os.path.join(os.path.dirname(abs_dirname), "hf_peft_tensors") ff_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(abs_dirname))), "build", "inference_tensors") @@ -17,7 +18,18 @@ def print_unique_files_list(dirname): if layer_num > 0 and layer_num != 100: files_list.remove(f) return sorted(files_list) -def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2): +def compare_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor are equal + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the checks. Defaults to 1e-2. 
+
+    Raises:
+        FileNotFoundError: If the HuggingFace tensor file does not exist
+        FileNotFoundError: If the FlexFlow tensor file does not exist
+    """
     if not os.path.exists(hf_tensor_filepath):
         raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found")
     if not os.path.exists(ff_tensor_filepath):
@@ -46,7 +58,15 @@ def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len_hf_tensor)
     print("Ok!")
-def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):
+def compare_tensors_difference(hf_tensor_filepath: str, ff_tensor1_filepath: str, ff_tensor2_filepath: str, tolerance: float = 1e-2):
+    """Check whether a HuggingFace tensor is equal to the difference between two FlexFlow tensors
+
+    Args:
+        hf_tensor_filepath (str): The file path of the HuggingFace tensor
+        ff_tensor1_filepath (str): The file path of the first FlexFlow tensor
+        ff_tensor2_filepath (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): The floating-point error tolerance for the equality check. Defaults to 1e-2.
+    """
     assert(os.path.exists(hf_tensor_filepath))
     assert(os.path.exists(ff_tensor1_filepath))
     assert(os.path.exists(ff_tensor2_filepath))
@@ -77,8 +97,17 @@ def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tenso
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len_hf_tensor)
     print("Ok!")
-def compare_hf_tensors(tensor1_fp, tensor2_fp):
-    assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))
+def compare_hf_tensors(tensor1_fp: str, tensor2_fp: str):
+    """Checks whether two HuggingFace tensors are equal
+
+    Args:
+        tensor1_fp (str): The file path of the first tensor
+        tensor2_fp (str): The file path of the second tensor
+    """
+    if not os.path.exists(tensor1_fp):
+        raise FileNotFoundError(f"HF tensor file: {tensor1_fp} not found")
+    if not os.path.exists(tensor2_fp):
+        raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found")
     hf_tensor1 = torch.load(tensor1_fp)
     hf_tensor2 = torch.load(tensor2_fp)
     if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
@@ -100,8 +129,20 @@ def compare_hf_tensors(tensor1_fp, tensor2_fp):
         assert(False)
     print("Ok!")
 
-def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):
-    assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))
+def check_hf_sum_tensors(tensor_sum_fp: str, tensor1_fp: str, tensor2_fp: str):
+    """Checks whether a HuggingFace tensor is equal to the sum of two other HuggingFace tensors
+
+    Args:
+        tensor_sum_fp (str): The file path of the sum tensor
+        tensor1_fp (str): The file path of the first tensor
+        tensor2_fp (str): The file path of the second tensor
+    """
+    if not os.path.exists(tensor_sum_fp):
+        raise FileNotFoundError(f"HF tensor file: {tensor_sum_fp} not found")
+    if not os.path.exists(tensor1_fp):
+        raise FileNotFoundError(f"HF tensor file {tensor1_fp} not found")
+    if not os.path.exists(tensor2_fp):
+        raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found")
     hf_tensor_sum = torch.load(tensor_sum_fp)
     hf_tensor1 = torch.load(tensor1_fp)
     hf_tensor2 = torch.load(tensor2_fp)
@@ -131,14 +172,27 @@ def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):
     print(mismatches)
     assert(False)
     print("Ok!")
-def check_hf_zero_tensor(hf_tensor_fp):
-    assert(os.path.exists(hf_tensor_fp))
+def check_hf_zero_tensor(hf_tensor_fp: str):
+    """Check whether a HuggingFace tensor is a zero tensor
+
+    Args:
+        hf_tensor_fp (str): The file path of the HuggingFace tensor
+    """
+    if not os.path.exists(hf_tensor_fp):
+        raise FileNotFoundError(f"HF tensor file: {hf_tensor_fp} not found")
     hf_tensor1 = torch.load(hf_tensor_fp)
     if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
         assert(len(hf_tensor1) == 1)
         hf_tensor1 = hf_tensor1[0]
     assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)
-def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=""):
+def print_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, txt: str = ""):
+    """Print the contents of a HuggingFace tensor and a FlexFlow tensor
+
+    Args:
+        hf_tensor_filepath (str): The file path of the HuggingFace tensor
+        ff_tensor_filepath (str): The file path of the FlexFlow tensor
+        txt (str, optional): Additional text to prepend to the tensors. Defaults to "".
+    """
     assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))
     hf_tensor = torch.load(hf_tensor_filepath)
     if type(hf_tensor) == tuple or type(hf_tensor) == list:
@@ -155,7 +209,23 @@ def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=""):
     print(hf_tensor)
     print(f"{txt} - FF tensor: ")
     print(ff_tensor)
-def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):
+def compare_flexflow_tensors(ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5, max_len: int = -1):
+    """Check whether two FlexFlow tensors are equal
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+        max_len (int, optional): Maximum number of elements to check (if > 0). Defaults to -1.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file does not exist
+        FileNotFoundError: If the second FlexFlow tensor file does not exist
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
     assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))
     ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')
     ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')
@@ -178,8 +248,22 @@ def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_l
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len(ff_tensor1))
     print("Ok!")
-def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):
-    assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))
+def compare_flexflow_tensors_shortest(ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5):
+    """Compare two FlexFlow tensors up to the maximum length of the shortest tensor
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file does not exist
+        FileNotFoundError: If the second FlexFlow tensor file does not exist
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
     ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')
     ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')
     minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])
@@ -195,8 +279,23 @@ def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len(ff_tensor1))
     print("Ok!")
-def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):
-    assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))
+def check_flexflow_tensors_sum(ff_tensor_sum_fp: str, ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance=1e-5):
+    """Check whether a FlexFlow tensor is equal to the sum of two other FlexFlow tensors
+
+    Args:
+        ff_tensor_sum_fp (str): The file path of the FlexFlow sum tensor
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file does not exist
+        FileNotFoundError: If the second FlexFlow tensor file does not exist
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
     ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')
     ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')
     ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')
@@ -215,18 +314,42 @@ def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, t
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len(ff_tensor1))
     print("Ok!")
-def load_ff_tensor(filename, shape):
+def load_ff_tensor(filename: str, shape: List[int]):
+    """Load a FlexFlow tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the FF tensor
+        shape (List[int]): The shape of the FF tensor
+
+    Returns:
+        np.ndarray: The FF tensor as a numpy array
+    """
     if ff_path not in filename:
         filename = os.path.join(ff_path, filename)
     ff_tensor = np.loadtxt(filename, delimiter=',').reshape(shape, order = 'F')
     return ff_tensor
-def load_hf_tensor(filename):
+def load_hf_tensor(filename: str):
+    """Load a HuggingFace tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the HF tensor
+
+    Returns:
+        np.ndarray: The HF tensor as a numpy array
+    """
     if hf_path not in filename:
         filename = os.path.join(hf_path, filename)
     hf_tensor = torch.load(filename)
     hf_tensor = hf_tensor.detach().cpu().numpy()
     return hf_tensor
 def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2):
+    """Check whether a HuggingFace tensor and a FlexFlow tensor, both already loaded into memory as numpy arrays, are equal
+
+    Args:
+        hf_tensor (np.ndarray): The HuggingFace tensor (in numpy array form)
+        ff_tensor (np.ndarray): The FlexFlow tensor (in numpy array form)
+        tolerance (float, optional): The floating-point error tolerance for the check. Defaults to 1e-2.
+ """ assert(hf_tensor.shape == ff_tensor.shape) mismatches = [] if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index 2d4a7cb353..c75f6f7d3f 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -173,6 +173,15 @@ def check_bwd_pass(tot_num_layers = 12): hf_BWD_ffn_norm_out = f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0" hf_BWD_ffn_norm_in = f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0" hf_BWD_attn_out_out = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0" + hf_BWD_attn_q_in = f"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0" + hf_FWD_w1_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0" + hf_FWD_w3_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0" + hf_FWD_act_fn_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0" + hf_BWD_attn_oproj_in = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0" + hf_attn_qproj_weight = f"{hf_path}/layers.{i}.self_attn.q_proj.weight" + hf_attn_kproj_weight = f"{hf_path}/layers.{i}.self_attn.k_proj.weight" + hf_attn_vproj_weight = f"{hf_path}/layers.{i}.self_attn.v_proj.weight" + hf_attn_oproj_weight = f"{hf_path}/layers.{i}.self_attn.o_proj.weight" # FlexFlow filepaths ff_BWD_w2_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0" @@ -195,7 +204,9 @@ def check_bwd_pass(tot_num_layers = 12): ff_BWD_ffn_norm_in2 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_input_1" ff_BWD_ffn_norm_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_0" ff_BWD_attn_out = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0" - + ff_BWD_attn_o_proj_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_o_proj_in_grad" + ff_attn_oproj_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_weight_0" + # HuggingFace checks print("\nHuggingface checks:") if i == tot_num_layers-1: @@ -217,7 +228,7 @@ def check_bwd_pass(tot_num_layers = 12): compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out) compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out) # compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out) - # compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in) + compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in) # compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) # HF-FlexFlow checks @@ -243,27 +254,77 @@ def check_bwd_pass(tot_num_layers = 12): compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out) print("-- Attention --") - compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out) num_tokens = 24 + hidden_size = 768 + qProjSize = 64 + num_heads = 12 + # Check output + compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out) + + # Check weights + ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',') + ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F') + ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F') + ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F') + ff_attn_oproj_weight_tensor = 
ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F') + + hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy() + hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy() + hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy() + hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy() + + assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5)) + assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5)) + assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5)) + assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5)) + # Compare attn outproj grad in tensors + compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in) + + # Compare vproj grads + hf_vproj_grads = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0" + ff_vproj_grads = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_v_proj_in_grad" + hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy() + ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F') + compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads) + + # Compare kproj grads + ff_kproj = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_devkproj" + ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F') + hf_kproj_grads = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0" + hf_kproj_grads = torch.load(hf_kproj_grads).squeeze() + reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy() + assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2)) + print("Ok!") + + # Compare qproj grads + hf_qproj_grads = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0" + hf_qproj_grads = torch.load(hf_qproj_grads).squeeze() + reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy() + ff_qproj = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_devQKVPRojArray" + ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0] + assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2)) + print("Ok!") + + # Compare attn grad input hf_attn_in = f"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0" - hf_attn_in = torch.load(hf_attn_in) - hf_attn_in = hf_attn_in.squeeze().T - hf_attn_in = hf_attn_in.detach().cpu().numpy() - print("hf_attn_in: ", hf_attn_in.shape) - print(hf_attn_in) + ff_attn_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_attn_final_grad_in" + compare_tensors(hf_attn_in, ff_attn_in) - ff_attn_in = f"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in" - ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F') - print("ff_attn_in: ", ff_attn_in.shape) - print(ff_attn_in) - #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2)) + # compare input layernorm + print("-- Input LayerNorm --") + if i > 0: + ff_input_ln_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1" + ff_attn_operator_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_input_0" + compare_flexflow_tensors(ff_attn_operator_in, ff_input_ln_out) + 
hf_input_ln_in = f"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.gi_0" + ff_input_ln_in0 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0" + ff_input_ln_in1 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_1" + compare_flexflow_tensors(ff_input_ln_in0, ff_input_ln_in1) + if i > 1: + compare_tensors(hf_input_ln_in, ff_input_ln_in0) + - mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in)) - mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))] - pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1]) - print(f"{pct_mismatch*100}% mismatch in attention input grads") - assert(pct_mismatch <= 0.1) if __name__ == "__main__": From dde0b61d28e3b7f5dd00ab7236e2d12d5cc20c74 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 May 2024 01:24:49 +0000 Subject: [PATCH 187/198] add llama peft alignment test to ci --- .github/workflows/gpu-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index b5260ead05..b78df90a5f 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -177,6 +177,7 @@ jobs: # PEFT tests ./tests/peft_tests.sh + python ./tests/peft/peft_alignment_test.py - name: Save inference output as an artifact if: always() From 1a31b65e5cb8de00c251d3bd53c8858d4a0e71cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= Date: Fri, 24 May 2024 21:41:19 +0000 Subject: [PATCH 188/198] Fix values for unused params in incr_decoding --- python/flexflow/core/flexflow_cffi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ec4cacfa6d..fdbab8eb89 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4406,8 +4406,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 for prompt in prompt_list ] max_sequence_lengths = [max_sequence_length for prompt in prompt_list] - peft_model_ids = [None for prompt in prompt_list] - dataset_filepaths = [None for prompt in prompt_list] + peft_model_ids = [PEFTModelID().handle for prompt in prompt_list] # Assign Dummy model ids + dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] ffc().flexflow_model_generate( self.handle, From 7e3d1111e35f3be7f3a091e1bb00edbfba5195cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= Date: Fri, 24 May 2024 23:30:50 +0000 Subject: [PATCH 189/198] Add PEFTModelID NO_ID singleton instead of None --- include/flexflow/flexflow_c.h | 2 ++ python/flexflow/core/flexflow_cffi.py | 14 ++++++++++++-- src/c/flexflow_c.cc | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index b651b31052..97a382ee8b 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -1068,6 +1068,8 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create(); flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); +flexflow_peft_model_id_t flexflow_peft_model_id_no_id(); + void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); #ifdef __cplusplus diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index fdbab8eb89..aa414f74d7 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ 
-1766,6 +1766,8 @@ def __init__( class PEFTModelID(object): __slots__ = ["handle", "_handle"] + __no_id_h = None + def __init__(self, id=None): if id is None: self.handle = ffc().flexflow_peft_model_id_create() @@ -1773,6 +1775,11 @@ def __init__(self, id=None): self.handle = ffc().flexflow_peft_model_id_create_id(id) self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) + @staticmethod + def no_id_handle(): + if PEFTModelID.__no_id_h is None: + PEFTModelID.__no_id_h = ffc().flexflow_peft_model_id_no_id() + return PEFTModelID.__no_id_h # ----------------------------------------------------------------------- # Request @@ -4406,7 +4413,7 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 for prompt in prompt_list ] max_sequence_lengths = [max_sequence_length for prompt in prompt_list] - peft_model_ids = [PEFTModelID().handle for prompt in prompt_list] # Assign Dummy model ids + peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] ffc().flexflow_model_generate( @@ -4451,7 +4458,10 @@ def generate(self, requests_list: List[Request]): max_sequence_lengths = [ request.max_sequence_length for request in requests_list ] - peft_model_ids = [request.peft_model_id for request in requests_list] + peft_model_ids = [ + (request.peft_model_id + if request.peft_model_id is not None else PEFTModelID.no_id_handle()) + for request in requests_list] dataset_filepaths = [ get_c_name(request.dataset_filepath) for request in requests_list ] diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 993d1b6a0d..e5f42c7df8 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2845,6 +2845,12 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { return FFCObjectWrapper::wrap(handle); } +flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { + PEFTModelID handle = PEFTModelID::NO_ID; + DEBUG_PRINT("[PEFTModelID] new %p", &handle); + return FFCObjectWrapper::wrap(&handle); +} + void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); From 079ba5932360158928ad73db5a20e5ea7515c6aa Mon Sep 17 00:00:00 2001 From: Remi <54138269+Flechman@users.noreply.github.com> Date: Fri, 24 May 2024 22:50:46 -0400 Subject: [PATCH 190/198] Fix PEFTModelID::NO_ID reference --- src/c/flexflow_c.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e5f42c7df8..43fcd55a02 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2846,9 +2846,9 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { } flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { - PEFTModelID handle = PEFTModelID::NO_ID; - DEBUG_PRINT("[PEFTModelID] new %p", &handle); - return FFCObjectWrapper::wrap(&handle); + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); } void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { From f464eb8911f2b845ea72ef3bf6e985bac07405e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 25 May 2024 20:27:09 +0000 Subject: [PATCH 191/198] reduce logging --- src/runtime/graph.cc | 19 +++++++------- src/runtime/inference_manager.cc | 43 ++++++++++++++++---------------- src/runtime/model.cc | 2 +- 3 
files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index dae0021bb6..e5b1eb3631 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2480,6 +2480,7 @@ namespace FlexFlow { using PCG::Edge; using PCG::Graph; using PCG::GraphCostResult; +using PCG::log_graph; using PCG::Node; void FFModel::register_all_machine_views( @@ -3158,20 +3159,20 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); + log_graph.debug("Deserialized Views...\n"); for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); + log_graph.debug("node[%zu]: type(%s) view(%d %d %d) ", + it.first.guid, + it.first.to_string().c_str(), + it.second.ndims, + it.second.dim[0], + it.second.start_device_id); auto const &list = graph->inEdges.at(it.first); for (auto const &it2 : list) { Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + log_graph.debug(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); } - printf("\n"); + log_graph.debug("\n"); } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 212d0ebf6b..cc967b0cfe 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -217,7 +217,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } } if (!found_parallel_tensor) { - log_offload.print( + log_offload.debug( "Cannot find a previous tensor for operator(%d) output_idx(%d)", op_idx, i); @@ -308,34 +308,35 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { continue; } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); + log_inf_mgr.debug( + "operator[%zu]: type(%s) guid(%lu)\n", + i, + get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } } @@ 
-665,7 +666,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2b6994c7b2..b28d3d7701 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3478,7 +3478,7 @@ void FFModel::compile(LossType loss_type, deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor From 8d89acdeee834e4b70caed7505937c5dbc07a121 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 May 2024 22:18:03 +0000 Subject: [PATCH 192/198] fix --- inference/python/spec_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index e8ef68b240..39529abda3 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -79,7 +79,7 @@ def get_configs(): "full_precision": False, } ], - # "prompt": "", + "prompt": "", "output_file": "", } # Merge dictionaries From 33c0fefc9c3b2dbbe4e29158e249a29d9747812c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 29 May 2024 08:02:52 +0000 Subject: [PATCH 193/198] fix --- python/flexflow/serve/models/starcoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 0cbb6d976c..2d4471201f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -225,7 +225,7 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace("transformer", "") + name = name.replace("transformer.h", "layers").replace("transformer.", "") if "attn.c_attn.weight" in name: name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") From 6727d3a8fde6d98aca1c657373f0c311e617c78d Mon Sep 17 00:00:00 2001 From: Remi Delacourt Date: Tue, 11 Jun 2024 20:48:13 +0000 Subject: [PATCH 194/198] Add peft demo --- inference/python/peft_demo/demo.py | 117 ++++++++++++++++++++ inference/python/peft_demo/demo_config.json | 29 +++++ 2 files changed, 146 insertions(+) create mode 100644 inference/python/peft_demo/demo.py create mode 100644 inference/python/peft_demo/demo_config.json diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py new file mode 100644 index 0000000000..651d281241 --- /dev/null +++ b/inference/python/peft_demo/demo.py @@ -0,0 +1,117 @@ +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from datasets import load_dataset +import random + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs.", + type=str, + default="", + required=True, + ) + args = parser.parse_args() + + # Load configs from JSON file + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not 
found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + +def init_llm_co_serving(configs_dict, configs): + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + # Add the different PEFT models to finetune + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + enable_peft_finetuning = (len(configs.finetuning_dataset) > 0), + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + +# Data comes from https://huggingface.co/datasets/databricks/databricks-dolly-15k +def import_dataset(): + inference_percentage = 0.6 + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + data = [] + for i,row in enumerate(dataset): + if len(row['context']) == 0: + data.append((row['instruction'],row['response'])) + inference_prompts = [] + finetuning_prompts = [] + for d in data: + if random.random() <= inference_percentage: + inference_prompts.append(d[0]) + else: + finetuning_prompts.append(d) + return inference_prompts, finetuning_prompts + + +if __name__ == "__main__": + print("Co-Serving Demo") + # Import config parameters + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + random.seed(configs.seed) + # Import inference dataset + # Import finetuning dataset + inference_prompts, finetuning_prompts = import_dataset() + # Initialize Llama2 lora model + llm = init_llm_co_serving(configs_dict, configs) + llm.start_server() + requests = [] + # Prepare inference requests + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=configs.max_sequence_length + ) + for prompt in inference_prompts + ] + requests += inference_requests + # Prepare finetuning requests + for peft_model_id in configs.peft_model_ids: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(peft_model_id), + dataset=finetuning_prompts, + ) + requests.append(finetuning_request) + # Jointly serve inference and finetuning requests + llm.generate(requests, max_length=configs.max_sequence_length) + llm.stop_server() + # Show statistics and metrics of the system + ## Show difference in loss on test dataset with finetuned and non-finetuned to prove that it works + ## Show compute resources utilized + other metrics + ## Compare with compute resources utilized without co-serving \ No newline at end of file diff --git a/inference/python/peft_demo/demo_config.json b/inference/python/peft_demo/demo_config.json new file mode 100644 index 0000000000..aca759e681 --- /dev/null +++ b/inference/python/peft_demo/demo_config.json @@ -0,0 +1,29 @@ +{ + "seed": 42, + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + 
"tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 4, + "offload": false, + "offload_reserve_space_size": 8192, + "use_4bit_quantization": false, + "use_8bit_quantization": false, + "enable_peft": true, + "peft_activation_reserve_space_size": 1024, + "peft_weight_reserve_space_size": 1024, + "profiling": false, + "benchmarking": false, + "inference_debugging": false, + "fusion": true, + "base_model": "meta-llama/Llama-2-7b-hf", + "peft_model_ids": ["goliaro/llama-2-7b-lora-full"], + "max_sequence_length": 128, + "cache_path": "", + "refresh_cache": false, + "full_precision": true, + "output_file": "../output/peft.txt" +} \ No newline at end of file From 6d7c245c51037c430328c45b9d9485e0c12f1e6a Mon Sep 17 00:00:00 2001 From: Remi Delacourt Date: Tue, 11 Jun 2024 21:05:45 +0000 Subject: [PATCH 195/198] Add readme for demo --- inference/python/peft_demo/INSTRUCTIONS.md | 25 +++++++++++++++++++++ inference/python/peft_demo/demo_config.json | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 inference/python/peft_demo/INSTRUCTIONS.md diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md new file mode 100644 index 0000000000..9b2a7a53b2 --- /dev/null +++ b/inference/python/peft_demo/INSTRUCTIONS.md @@ -0,0 +1,25 @@ +## Peft Demo +* `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` +* `cd FlexFlow/` + +* If you wish to run the demo by installing FlexFlow + * `conda env create -f conda/flexflow.yml` + * `conda activate flexflow` + +* If you wish to run the demo using a Docker container + * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` + +* Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) + + * `export HUGGINGFACE_TOKEN="[Your token]"` + * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + +* Run the demo + ``` + mkdir inference/output + cd inference/python/peft_demo/ + python3 demo.py -config-file demo_config.json + ``` + + diff --git a/inference/python/peft_demo/demo_config.json b/inference/python/peft_demo/demo_config.json index aca759e681..fa8f577e04 100644 --- a/inference/python/peft_demo/demo_config.json +++ b/inference/python/peft_demo/demo_config.json @@ -25,5 +25,5 @@ "cache_path": "", "refresh_cache": false, "full_precision": true, - "output_file": "../output/peft.txt" + "output_file": "../../output/peft_demo.txt" } \ No newline at end of file From 511fd649da6ec3587d11da12cf0389b746dfd569 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 20 Jun 2024 02:06:31 +0000 Subject: [PATCH 196/198] fix alignment issue --- include/flexflow/config.h | 27 +++-- include/flexflow/utils/cuda_helper.h | 1 + inference/MODEL_WEIGHTS.md | 28 ----- inference/README.md | 27 +++++ src/c/flexflow_c.cc | 2 +- src/ops/inc_multihead_self_attention.cu | 9 +- src/ops/spec_inc_multihead_self_attention.cu | 43 ++------ src/ops/tree_inc_multihead_self_attention.cu | 20 ++-- src/runtime/cuda_helper.cu | 17 +++ src/runtime/model.cpp | 4 +- src/runtime/model.cu | 4 +- src/runtime/request_manager.cpp | 45 ++++---- src/runtime/request_manager.cu | 108 ++++++++----------- 13 files changed, 151 insertions(+), 184 deletions(-) delete mode 100644 inference/MODEL_WEIGHTS.md create mode 100644 inference/README.md diff --git a/include/flexflow/config.h 
b/include/flexflow/config.h index 3cf985f279..dd9d657117 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -68,6 +68,23 @@ class FFConfig; class MemoryAllocator; class PEFTWeightAllocator; +struct CombinedBatchConfigMetaStruct { + BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS]; + BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS]; + BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS]; + bool request_completed[BatchConfig::MAX_NUM_REQUESTS]; + + BeamSearchBatchConfig::BeamSearchPerTokenInfo + beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS + + BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM * + BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + BeamSearchBatchConfig::BeamSearchPerRequestInfo + beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + + TreeVerifyBatchConfig::CommittedTokensInfo + committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS]; +}; + struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnHandle_t dnn; @@ -78,16 +95,10 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; - void *batch_config_metadata; + CombinedBatchConfigMetaStruct *batch_config_metadata; // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; size_t offload_reserve_space_size; // PEFT related fields diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index caaa54683a..486a65eb3d 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -183,3 +183,4 @@ cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. 
- -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000..7ddf118715 --- /dev/null +++ b/inference/README.md @@ -0,0 +1,27 @@ +# Inference Examples +This folder contains the code to run inference examples in FlexFlow + +To create a sample prompt, call (from the `build` folder): + +```bash +mkdir -p ../inference/prompt +echo '["San Francisco is a "]' > ../inference/prompt/test.json +``` + +To download a model for use in C++, call: +```bash +huggingface-cli login # if needed +python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only +``` + +To run the incremental decoding example in C++, call: + +```bash +./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run the speculative inference example in C++, call: + +```bash +./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 43fcd55a02..76ca5053d6 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2846,7 +2846,7 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { } flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { - PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); DEBUG_PRINT("[PEFTModelID] new %p", handle); return FFCObjectWrapper::wrap(handle); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index d1b93cb206..aa98dc4964 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -2088,11 +2088,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); - token_infos = - static_cast(handler.batch_config_metadata); - request_infos = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo)); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); if (offload) { // token_infos = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6c3ef9895b..4688a8233c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -542,20 +542,9 @@ void 
compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast<DT *>(m->devQKVProjArray) + bc->requestsInfo[i].first_token_offset_in_batch * m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast<DT *>(m->qk_prods); - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast<DT *>
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -855,29 +844,15 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 909b34aa5f..02f39192df 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -1061,21 +1061,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); committed_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 56294c5e35..880a570b0c 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -646,6 +646,23 @@ void check_device_vs_host_ptr(void const *maybe_devicePtr) { } } +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? 
"yes" : "no"), + (aligned16 ? "yes" : "no")); +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b781567..0a8253dd2f 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -174,8 +174,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 5e07ae0894..56b1e2a6a5 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -172,8 +172,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6d..8e5f302466 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -73,74 +73,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(beam_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - 
checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(tree_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->committed_tokens, &(tree_batch_config->committed_tokens), sizeof(TreeVerifyBatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - } - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 235d435580..343f1dd6e6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -93,91 +93,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, + 
&(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->committed_tokens, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); } - - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( From 2899ba29e92ec42787a45adcd0e798a7a89d4ab0 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 9 Jul 2024 01:52:30 +0000 Subject: [PATCH 197/198] Initial implemention of disaggregated attention and qkv projection --- .../inc_multihead_self_attention_kernels.h | 10 + inference/models/llama.cc | 35 ++- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/inc_multihead_self_attention.cpp | 187 +++++++------ src/ops/inc_multihead_self_attention.cu | 163 +++++++++++- src/ops/spec_inc_multihead_self_attention.cpp | 89 ++++--- src/ops/spec_inc_multihead_self_attention.cu | 23 +- src/ops/tree_inc_multihead_self_attention.cpp | 89 ++++--- src/ops/tree_inc_multihead_self_attention.cu | 36 ++- src/runtime/file_loader.cc | 248 ++++++++++++++++++ 10 files changed, 707 insertions(+), 175 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e2..552d5e3496 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -100,6 +100,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, ffStream_t stream); +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int 
shard_id, + // DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + ffStream_t stream); + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 4be232e81b..e4b2e5a537 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -92,11 +92,26 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm = token_att_norm[1]; } + Tensor qkv_proj = ff.dense( + att_norm, + llama_config.hidden_size * 3, // q, k, v. need to change if want to remove replication. (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like llama does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers_" + std::to_string(i) + "_attn_qkv_proj") + .c_str() + ); + Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, @@ -119,7 +134,7 @@ void LLAMA::create_llama_model(FFModel &ff, } case TREE_VERIFY_MODE: { mha = ff.inc_multihead_self_attention_verify( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, @@ -142,7 +157,7 @@ void LLAMA::create_llama_model(FFModel &ff, } case INC_DECODING_MODE: { mha = ff.inc_multihead_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, @@ -168,6 +183,20 @@ void LLAMA::create_llama_model(FFModel &ff, } } + Tensor mha_input = mha; + mha = ff.dense(mha_input, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_attn_o_proj") + .c_str()); + // step 2: SILU activaion Tensor token_ff_norm[2] = {nullptr, nullptr}; ff.residual_rms_norm( diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 5d52034575..1d5528f759 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -142,7 +142,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, for (int i = 0; i < numdims; i++) { dims[i] = input->dims[i]; } - dims[0] = embed_dim; + dims[0] = vdim * num_kv_heads; // we now output o_proj_dim * o_heads li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d38f93558e..1fd2564013 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -246,7 +246,7 @@ template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, + // DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -277,25 +277,26 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int k = m->qSize; int m_ = m_q * QKV_WEIGHT_NUM; int lda = k, ldb = k, ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - input_ptr, - hipblas_data_type, - ldb, 
- &beta, - output_ptr, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // this projection is done in dense layer, no need for gemm here + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_N, + // m_, + // n, + // k, + // &alpha, + // weight_ptr, + // hipblas_data_type, + // lda, + // input_ptr, + // hipblas_data_type, + // ldb, + // &beta, + // output_ptr, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor @@ -303,25 +304,38 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v - if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
<DT>), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { + // if (*m->qkv_bias) { + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv<DT>), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // shard_id, + // num_tokens, + // m->qProjSize, + // m->kProjSize, + // m->vProjSize, + // m->global_num_q_heads, + // m->num_q_heads, + // *m->scaling_query, + // m->scaling_factor, + // m->hidden_size); + // } else if (m->scaling_query) { + // hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // num_tokens, + // m->num_q_heads, + // m->qProjSize, + // m->scaling_factor, + // m->hidden_size); + // } + if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -439,7 +453,7 @@ template void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -457,11 +471,22 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); + + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
<DT *>(m->devQKVProjArray), bias_ptr, @@ -703,47 +728,51 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->kProjSize * m->num_q_heads + m->vProjSize * m->num_q_heads); B = C; - C = static_cast<DT *>(output_ptr) + tokens_previous_requests * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + C = static_cast<DT *>
(output_ptr) + tokens_previous_requests * m->oProjSize; // what is the shape here? + + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_T, + // m_, + // n, + // k, + // &alpha, + // A, + // hipblas_data_type, + // lda, + // B, + // hipblas_data_type, + // ldb, + // &beta, + // C, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); tokens_previous_requests += num_new_tokens; } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); - } - + // if (*m->final_bias && shard_id == 0) { + // int parallelism = m->oProjSize * num_tokens; + // int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + // m->kProjSize * m->global_num_q_heads + + // m->vProjSize * m->global_num_q_heads; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // num_tokens, + // qkv_weight_size, + // m->oProjSize); + // } + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); assert(tokens_previous_requests == num_tokens); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index aa98dc4964..8334dc0636 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -640,6 +640,139 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->hidden_size); } + + // Step 3: apply rotary embedding if needed + if (*m->apply_rotary_embedding) { + /*q&k*/ + parallelism = num_tokens * m->hidden_size; + apply_rotary_embedding_hf<<>>(output_ptr, + m->complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); + } +} + +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + // DT const *input_ptr, we no longer use the raw input + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + assert(m->qSize == m->vSize && m->qSize == m->kSize); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } +#endif + + // this block is deleted so that dense operator are done in model, + // which allows for peft on qkv projection + // // Step 1: Compute QKV projections + // { + // DT alpha = 1.0f, beta = 0.0f; + // // after transpositions + // int m_q = m->qProjSize * m->num_q_heads; + // int m_k = m->kProjSize * m->num_q_heads; + // int m_v = m->vProjSize * m->num_q_heads; + // assert(m_q == m_k && m_k == m_v); // keep things simple for now + // int n = bc->num_active_tokens(); + // int k = m->qSize; + // int m_ = m_q * QKV_WEIGHT_NUM; + // // before transpositions + // int lda = k, ldb = k, ldc = m_; + // // matrix A: QKV weights + // // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // // matrix B: input + // // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // // matrix C: devQKVProjArray + // // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + // checkCUDA(cublasGemmEx(m->handle.blas, + // CUBLAS_OP_T, + // CUBLAS_OP_N, + // m_, + // n, + // k, + // &alpha, + // weight_ptr, + // cublas_data_type, + // lda, + // input_ptr, + // cublas_data_type, + // ldb, + // &beta, + // output_ptr, + // cublas_data_type, + // ldc, + // compute_type, + // CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // } + + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_q_heads; + size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + + // Step 2: apply bias for QKV, or scale the query + // this are handled in the dense layer with bias, but we still need to handle scaling + // if (*m->qkv_bias) { + // 
apply_proj_bias_qkv<<>>(output_ptr, + // bias_ptr, + // shard_id, + // num_tokens, + // m->qProjSize, + // m->kProjSize, + // m->vProjSize, + // m->global_num_q_heads, + // m->num_q_heads, + // *m->scaling_query, + // m->scaling_factor, + // m->hidden_size); + // } else if (m->scaling_query) { + // scaling_query_kernel<<>>(output_ptr, + // num_tokens, + // m->num_q_heads, + // m->qProjSize, + // m->scaling_factor, + // m->hidden_size); + // } + + if (m->scaling_query) { + scaling_query_kernel<<>>(output_ptr, + num_tokens, + m->num_q_heads, + m->qProjSize, + m->scaling_factor, + m->hidden_size); + } + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ @@ -860,7 +993,7 @@ template void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -872,11 +1005,23 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, bias_ptr = static_cast
(m->bias_ptr); } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + + compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), bias_ptr, @@ -897,8 +1042,18 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + + // this dense layer (with bias) is done in the model by seperate dense layer + // compute_o_prod_bias( + // m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + + // simply copy the result to output_ptr + // TODO: change the meta for output, maybe transpose here? + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index aebd5e8892..5c5cade645 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -414,45 +414,50 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_T, + // m_, + // n, + // k, + // &alpha, + // A, + // hipblas_data_type, + // lda, + // B, + // hipblas_data_type, + // ldb, + // &beta, + // C, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); - } + // if (*m->final_bias && shard_id == 0) { + // int parallelism = m->oProjSize * num_tokens; + // int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + // m->kProjSize * m->global_num_q_heads + + // m->vProjSize * m->global_num_q_heads; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // num_tokens, + // qkv_weight_size, + // m->oProjSize); + // } + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); assert(tokens_previous_requests == num_tokens); } @@ -461,7 +466,7 @@ template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -494,11 +499,21 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), hipMemcpyHostToDevice, stream)); + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), bias_ptr, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 4688a8233c..ce8055286e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -698,17 +698,27 @@ template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { + + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), bias_ptr, @@ -728,8 +738,13 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + // compute_o_prod_bias( + // m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace SpecIncMultiHeadSelfAttention diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 03e0ac6441..24476063b2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -391,47 +391,52 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(output_ptr) + processed_tokens_in_batch * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_T, + // m_, + // n, + // k, + // &alpha, + // A, + // hipblas_data_type, + // lda, + // B, + // hipblas_data_type, + // ldb, + // &beta, + // C, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * processed_tokens_in_batch * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); + // if (*m->final_bias && shard_id == 0) { + // int parallelism = m->oProjSize * processed_tokens_in_batch; + // int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + // m->kProjSize * m->global_num_q_heads + + // m->vProjSize * m->global_num_q_heads; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // processed_tokens_in_batch, + // qkv_weight_size, + // m->oProjSize); + // } assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } @@ -440,7 +445,7 @@ template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -490,11 +495,21 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, sizeof(TreeVerifyBatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
<DT *>(m->devQKVProjArray),
                     bias_ptr,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 02f39192df..1dd5773ef4 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -874,7 +874,7 @@ template <typename DT>
 void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       TreeVerifyBatchConfig const *bc,
                       int shard_id,
-                      DT const *input_ptr,
+                      DT const *qkv_ptr,
                       DT const *weight_ptr,
                       DT *output_ptr,
                       DT const *bias_ptr,
@@ -915,11 +915,21 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
         m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
+  // phase 0: copy calculated qkv into devQKVProjArray
+  // [qProjSize, num_heads, 3, num_new_tokens]
+  size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens();
+
+  cudaMemcpyAsync(m->devQKVProjArray,
+                  qkv_ptr,
+                  qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here
+                  cudaMemcpyDeviceToDevice,
+                  stream);
+
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv_kernel(m,
                      bc,
                      shard_id,
-                     input_ptr,
+                     // input_ptr,
                      weight_ptr,
                      static_cast<DT *>
(m->devQKVProjArray), bias_ptr, @@ -934,14 +944,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, int processed_tokens_in_batch = bc->num_active_tokens(); - compute_o_prod_bias(m, - bc, - shard_id, - output_ptr, - weight_ptr, - bias_ptr, - processed_tokens_in_batch, - stream); + // compute_o_prod_bias(m, + // bc, + // shard_id, + // output_ptr, + // weight_ptr, + // bias_ptr, + // processed_tokens_in_batch, + // stream); + int num_tokens = bc->num_active_tokens(); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace TreeIncMultiHeadAttention diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index c373e0da9b..a66a6097ea 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -127,6 +127,59 @@ void load_attention_weights_multi_query(DT *ptr, } } +template +void load_attention_o_proj_bias_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder) { + std::string filename = layer_name + "_wo_bias"; + + int file_index = 0; + + // now only opt use this. + // assert(num_heads == num_kv_heads); + int idx = 0; + + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int n_heads = num_heads; + + int replicate_num = num_heads / num_kv_heads; + + size_t out_partial_size = hidden_dim; + size_t partial_size = out_partial_size; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); + } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + for (int i = 0; i < partial_size; i++) { + ptr[i] = host_array.at(data_index); + data_index++; + } + + in.close(); +} + template void load_attention_bias_v2(DT *ptr, int num_heads, @@ -207,6 +260,140 @@ void load_attention_bias_v2(DT *ptr, } } +template +void load_attention_weights_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree, + bool load_o_proj) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight + std::string q_file = layer_name + "_wq_weight"; + std::string k_file = layer_name + "_wk_weight"; + std::string v_file = layer_name + "_wv_weight"; + std::string o_file = layer_name + "_wo_weight"; + std::vector weight_filenames = {q_file, k_file, v_file}; + int file_index = 0; + + int base_index = 0; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + + size_t q_size = one_weight_file_size, o_size = one_weight_file_size; + size_t k_size = single_proj_size * num_kv_heads, + v_size = single_proj_size * num_kv_heads; + + size_t k_replicate_size = one_weight_file_size; + size_t v_replicate_size = one_weight_file_size; + + int replicate_num = num_heads / num_kv_heads; + + // stride for q, k, v, o + size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + tensor_parallelism_degree; + if(!load_o_proj) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
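// Aside: load_attention_o_proj_bias_to_dense_v2 above and the weight loaders
// below repeat the same open/read/verify sequence on a raw binary file. A
// compact sketch of that pattern as a reusable helper -- the helper name is
// illustrative, not part of the patch:
#include <cassert>
#include <fstream>
#include <string>
#include <vector>

template <typename DT>
std::vector<DT> read_binary_weights(std::string const &filepath, size_t num_elements) {
  std::ifstream in(filepath, std::ios::in | std::ios::binary);
  assert(in.good() && "incorrect weight file path");
  std::vector<DT> host_array(num_elements);
  size_t expected_bytes = sizeof(DT) * num_elements;
  in.read(reinterpret_cast<char *>(host_array.data()), expected_bytes);
  // gcount() reports how many bytes were actually read; mismatch means a bad file.
  assert(static_cast<size_t>(in.gcount()) == expected_bytes && "weight file size mismatch");
  return host_array;
}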
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; + assert(false && "data size mismatch"); + } + // wq, wk, wo + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = host_array.at(data_index++); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + + j] = host_array.at(kv_idx * single_proj_size + j); + } + } + } + + // assert(data_index == partial_size); + base_index += one_partition_size; + file_index++; + } + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); + } else { + std::cout << "Loading weight file " << o_file << std::endl; + std::string weight_filepath = join_path({weights_folder, o_file}); + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
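// Aside: in the K/V branch above, each key/value head is replicated across the
// query heads that share it (grouped-query attention) and the heads are striped
// across tensor-parallel shards, stride_size elements apart. A small worked
// example of the index arithmetic, using illustrative sizes (32 query heads,
// 8 KV heads, tensor_parallelism_degree 4 -- not values taken from the patch):
#include <cstdio>
#include <initializer_list>

int main() {
  int num_heads = 32, num_kv_heads = 8, tensor_parallelism_degree = 4;
  for (int i : {0, 5, 13, 31}) {
    int kv_idx = i / (num_heads / num_kv_heads);                // source KV head
    int head_idx = i % (num_heads / tensor_parallelism_degree); // slot inside the shard
    int tp_idx = i / (num_heads / tensor_parallelism_degree);   // which TP shard
    std::printf("q head %2d <- kv head %d, shard %d, slot %d\n",
                i, kv_idx, tp_idx, head_idx);
  }
  return 0;
}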
host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(DT) * one_weight_file_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error" << std::endl; + assert(false); + } + assert(one_weight_file_size == host_array.size()); + int data_index = 0; + + int one_partition_size = + qkv_inner_dim * (num_heads / tensor_parallelism_degree); + for (int i = 0; i < one_weight_file_size; i++) { + int part_idx = (i / one_partition_size) % tensor_parallelism_degree; + int block_num = (i / one_partition_size); + int offset = block_num / tensor_parallelism_degree * one_partition_size + + (i % one_partition_size); + ptr[part_idx * stride_size + offset] = + host_array.at(data_index++); + } + + in.close(); + + assert(data_index == one_weight_file_size); + } +} + template void load_attention_weights_v2(DT *ptr, int num_heads, @@ -720,6 +907,21 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, DT *data = (DT *)malloc(sizeof(DT) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); + bool is_attn_proj = false, is_o_proj = false; + + if (weight_filename.find("_proj") != std::string::npos) { + size_t pos = weight_filename.find("_attn_o_proj"); + if (pos != std::string::npos) { + weight_filename.replace(pos, std::string("_attn_o_proj").length(), "_attention"); + is_o_proj = true; + } else { + pos = weight_filename.find("_attn_qkv_proj"); + assert(pos != std::string::npos); + weight_filename.replace(pos, std::string("_attn_qkv_proj").length(), "_attention"); + } + is_attn_proj = true; + } + if (ff->config.benchmarking) { std::cout << "Initializing weight " << weight_filename @@ -753,6 +955,52 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, weight_filename, weights_folder); } + } else if(is_attn_proj) { + if(is_o_proj) { + if(weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + true); + } else { + load_attention_o_proj_bias_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder); + + } + } else { + if(weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + false); + } else { + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + false, // do not load o_proj bias + weight_filename, + weights_folder); + } + } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) From 94e156321c9a8c517e24feb09ea7137eb3171962 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 10 Jul 2024 05:31:46 +0000 Subject: [PATCH 198/198] fixed filename problem from renaming weight file --- inference/models/llama.cc | 4 ++-- src/runtime/file_loader.cc | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e4b2e5a537..d3319a8a5d 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -103,7 +103,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, // ? 
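// Aside: in the o_proj branch of load_attention_weights_to_dense_v2 above, the
// flat wo weight is split into blocks of one_partition_size elements and dealt
// out round-robin across tensor-parallel shards; each shard keeps its blocks
// contiguous and consecutive shards sit stride_size elements apart. A sketch of
// the destination index as a pure function (same arithmetic as the loop body;
// the function name is illustrative only):
#include <cstddef>

inline size_t o_proj_dst_index(size_t i,
                               size_t one_partition_size,
                               int tensor_parallelism_degree,
                               size_t stride_size) {
  size_t block_num = i / one_partition_size;
  size_t part_idx = block_num % tensor_parallelism_degree; // which TP shard
  size_t offset = (block_num / tensor_parallelism_degree) * one_partition_size +
                  i % one_partition_size;                  // position within the shard
  return part_idx * stride_size + offset;
}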
REG_MODE_NONE, // no regularization 0.0f, // no dropout - std::string("layers_" + std::to_string(i) + "_attn_qkv_proj") + std::string("layers." + std::to_string(i) + ".attn_qkv_proj") .c_str() ); @@ -194,7 +194,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_attn_o_proj") + std::string("layers." + std::to_string(i) + ".attn_o_proj") .c_str()); // step 2: SILU activaion diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index a66a6097ea..4ed31eb4dd 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -135,7 +135,7 @@ void load_attention_o_proj_bias_to_dense_v2(DT *ptr, size_t qkv_inner_dim, std::string layer_name, std::string weights_folder) { - std::string filename = layer_name + "_wo_bias"; + std::string filename = layer_name + ".o_proj.bias"; int file_index = 0; @@ -273,10 +273,10 @@ void load_attention_weights_to_dense_v2(DT *ptr, bool load_o_proj) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -909,15 +909,18 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, std::string weight_filename = removeGuidOperatorName(std::string(l->name)); bool is_attn_proj = false, is_o_proj = false; - if (weight_filename.find("_proj") != std::string::npos) { - size_t pos = weight_filename.find("_attn_o_proj"); + if (weight_filename.find("attn_") != std::string::npos) { + size_t pos = weight_filename.find(".attn_o_proj"); if (pos != std::string::npos) { - weight_filename.replace(pos, std::string("_attn_o_proj").length(), "_attention"); + weight_filename.replace(pos, std::string(".attn_o_proj").length(), ".self_attn"); is_o_proj = true; } else { - pos = weight_filename.find("_attn_qkv_proj"); + pos = weight_filename.find(".attn_qkv_proj"); + if(pos == std::string::npos) { + cout<