Initial implementation of disaggregated attention and qkv projection #1433

Closed — wants to merge 226 commits

Commits
cbfd652
.
jiazhihao Sep 27, 2023
f53a67a
Merge branch 'inference' of https://github.com/flexflow/FlexFlow into…
jiazhihao Sep 27, 2023
60702fc
format
jiazhihao Sep 27, 2023
8360de0
Merge branch 'inference' into peft
goliaro Sep 29, 2023
102745a
resolve merge conflict
jiazhihao Oct 2, 2023
eaf42a4
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
jiazhihao Oct 2, 2023
da9ce1b
implement LoraLinear
jiazhihao Oct 3, 2023
66230bd
add missing files
jiazhihao Oct 3, 2023
f0d1155
format
jiazhihao Oct 3, 2023
00f926b
Merge branch 'inference' into peft
jiazhihao Oct 3, 2023
fb203cc
LoraLinear now takes two inputs and generates one output
jiazhihao Oct 4, 2023
c3d9c38
LoRA forward pass works now
jiazhihao Oct 4, 2023
c4cfcc3
[LoraLinear] update to allocate weight through per-GPU PEFTWeightAllo…
jiazhihao Oct 7, 2023
8b98c45
Merge branch 'inference' into peft
jiazhihao Oct 7, 2023
ea8920b
add API for registering PEFT models
jiazhihao Oct 8, 2023
0e09ac1
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
jiazhihao Oct 8, 2023
44cc16b
bug fix
jiazhihao Oct 8, 2023
29e5547
format
jiazhihao Oct 8, 2023
dfd1c9a
add reserved work space for peft activations and weights
jiazhihao Oct 8, 2023
bb76f75
Merge branch 'inference' into peft
goliaro Oct 10, 2023
e6f671d
fix merge conflicts, implement layernorm peft_bwd
goliaro Oct 11, 2023
207b127
cleanup
goliaro Oct 11, 2023
231e244
rms backward
goliaro Oct 11, 2023
416c322
rms peft
goliaro Oct 11, 2023
f72067a
add LoraLinearConfig
jiazhihao Oct 11, 2023
49e5664
add an API for register peft request
jiazhihao Oct 11, 2023
367bfa5
resolve merge conflict
jiazhihao Oct 15, 2023
008ffd9
format
jiazhihao Oct 15, 2023
2e0aa76
resolve merge conflict
jiazhihao Oct 16, 2023
ace7e3f
.
jiazhihao Oct 17, 2023
6bbb81e
variable renaming
jiazhihao Oct 17, 2023
b6735f9
checkpoint
jiazhihao Oct 17, 2023
0978844
Merge branch 'inference' into peft
goliaro Oct 17, 2023
6a0d51b
resolve merge conflict
jiazhihao Oct 17, 2023
54084c4
resolve conflict
jiazhihao Oct 17, 2023
91f849e
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
jiazhihao Oct 17, 2023
a44e33d
add missing functions
jiazhihao Oct 18, 2023
4d55b40
remove OpMeta(FFhandler) constructor
jiazhihao Oct 18, 2023
eb14798
residual rms norm backward
goliaro Oct 18, 2023
e7fa9ce
cleanup
goliaro Oct 18, 2023
5ca5e49
Merge branch 'inference' of https://github.com/flexflow/FlexFlow into…
jiazhihao Oct 18, 2023
5f7f710
bug fix
jiazhihao Oct 18, 2023
7b2bd08
finished peft bwd for residual rms norm
goliaro Oct 19, 2023
d2f177d
sigmoid_silu_multi backward and peft_bwd
goliaro Oct 19, 2023
8b1f76b
hip_rocm update
goliaro Oct 19, 2023
84c391b
support peft_bwd for fused layers
jiazhihao Oct 20, 2023
1cc723e
format
jiazhihao Oct 20, 2023
f1d5dc0
residual layer norm bwd / peft_bwd
goliaro Oct 21, 2023
3b50e17
fix typo
goliaro Oct 22, 2023
bdb590b
add_bias_residual_layer_norm backward and peft_bwd
goliaro Oct 22, 2023
60c0418
implement IncMHA peft_bwd
jiazhihao Oct 22, 2023
1ce2f27
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
jiazhihao Oct 22, 2023
d6d39ce
resolve merge conflict
jiazhihao Oct 22, 2023
b763ac2
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
jiazhihao Oct 22, 2023
509c54c
several bug fixes
jiazhihao Oct 22, 2023
bc9f538
[rms_norm] do not compute non-peft-bwd tokens in peft-bwd
jiazhihao Oct 23, 2023
d8e92e9
.
jiazhihao Oct 23, 2023
0a512d2
.
jiazhihao Oct 24, 2023
4ee710a
Update the default cublas behavior when CUDA_VERSION is not specified
jiazhihao Oct 24, 2023
2adca3a
Merge branch 'fix_cublas_default' of https://github.com/flexflow/Flex…
jiazhihao Oct 24, 2023
464424e
fix bugs in IncMHA peft_bwd kernel
jiazhihao Oct 24, 2023
82d6e58
resolve merge conflict
jiazhihao Oct 24, 2023
45c1e01
uncomment softmaxbackward
jiazhihao Oct 24, 2023
07636e8
add layernorm to align test
goliaro Oct 24, 2023
28a5e84
add peft test scripts
goliaro Oct 24, 2023
dd94370
fix import
goliaro Oct 24, 2023
3c01328
fix
goliaro Oct 24, 2023
fa56364
add code to convert peft models
goliaro Oct 26, 2023
a484100
add script to download peft for c++, fix bug
goliaro Oct 26, 2023
c83c376
fix
goliaro Oct 26, 2023
aa9f004
add script to fine-tune models
goliaro Oct 27, 2023
4609e9e
implement loading lora configs/weights from file
goliaro Oct 31, 2023
17fa6f3
remove peft_bwd assertion failure in embedding
goliaro Oct 31, 2023
cdc12e6
fix download script
goliaro Oct 31, 2023
eb9e2b8
add peft dependencies in dockerfile
goliaro Oct 31, 2023
3dfa14d
fix softmax backward
goliaro Oct 31, 2023
78523e8
fix bc print indentation
goliaro Nov 1, 2023
bf78ea4
Temporarily Revert "Update the default cublas behavior when CUDA_VERS…
goliaro Nov 2, 2023
b9e7f60
Fix cublas default (#1220)
goliaro Nov 2, 2023
463c757
fix bugs, work on align opt-lora
goliaro Nov 3, 2023
1c231ba
Merge branch 'inference' into peft
goliaro Nov 6, 2023
7c65521
update scripts
goliaro Nov 6, 2023
f4b3f8f
add code to output peft tensors in hf
goliaro Nov 6, 2023
9e5fea9
update, fixes
goliaro Nov 7, 2023
62edfaa
linting
goliaro Nov 7, 2023
ddb5c29
fix printing of tensors for numpy
goliaro Nov 7, 2023
d276496
update save_inference_tensors_to_file
goliaro Nov 8, 2023
bc79d3b
linting
goliaro Nov 8, 2023
8e34632
update
goliaro Nov 8, 2023
b11c5e9
fix issue with save_inference_tensors_to_file
goliaro Nov 8, 2023
fca16cc
fix layer names for save_inference_tensors_to_file
goliaro Nov 8, 2023
9095f2b
fix peft
goliaro Nov 9, 2023
9769604
fix bwd bugs
goliaro Nov 10, 2023
880ede8
linting
goliaro Nov 10, 2023
818375d
fixes
goliaro Nov 10, 2023
2990e20
fix
goliaro Nov 10, 2023
6959e68
fix
goliaro Nov 10, 2023
266368c
fix
goliaro Nov 10, 2023
06775bd
add bc fields for peft training
goliaro Nov 10, 2023
ca879e2
merge conflicts
goliaro Nov 10, 2023
9f60177
linting
goliaro Nov 10, 2023
9442b62
fix
goliaro Nov 10, 2023
11eccb1
remove ptr check
goliaro Nov 10, 2023
9bfc557
fix
goliaro Nov 10, 2023
bcfae08
implement save_operators for bwd
goliaro Nov 12, 2023
d86272c
fix bug
goliaro Nov 13, 2023
0a3258a
implement save tensors for bwd
goliaro Nov 13, 2023
e34c405
.
goliaro Nov 15, 2023
87fbada
bug fix
goliaro Nov 15, 2023
52759bd
fix
goliaro Nov 15, 2023
2a5371d
align linear
goliaro Nov 15, 2023
ed0be61
fix
goliaro Nov 16, 2023
8a0b6ea
bwd kernel updates
goliaro Nov 17, 2023
b0e686d
undo use of CUBLAS_COMPUTE_32F_FAST_16F for now
goliaro Nov 17, 2023
0daf232
only send dataset entry once
goliaro Nov 19, 2023
ec131c7
update peft test scripts
goliaro Nov 20, 2023
0431c73
loss
xinhaoc Nov 20, 2023
371dffd
.
xinhaoc Nov 20, 2023
da690ff
update generate/request api to take both inference and fine-tuning pr…
goliaro Nov 21, 2023
1e5bb72
linting
goliaro Nov 21, 2023
f3ff40b
alignment fixes in lora & linear layer
goliaro Nov 21, 2023
7efd3a7
alignment fix
goliaro Nov 21, 2023
b6fe334
diagonal
xinhaoc Nov 22, 2023
bcf8b19
fix
goliaro Nov 22, 2023
4bfee96
alignment fix ssm
goliaro Nov 22, 2023
efd1976
sigmoid-silu-multi now fully aligned
goliaro Nov 24, 2023
7ae195a
rms norm kernel updates
goliaro Nov 24, 2023
7030814
fix
goliaro Nov 26, 2023
eb3b6ab
in-place residual rms
goliaro Nov 26, 2023
9f26cc1
Merge branch 'inference' into peft
goliaro Nov 27, 2023
a122e30
bug fix and linting
goliaro Nov 28, 2023
53e737b
align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj wi…
goliaro Nov 30, 2023
edc02af
cleanup
goliaro Nov 30, 2023
f00c7e0
finished all alignment fixes in attention backward kernel
goliaro Nov 30, 2023
3955b0b
fix
goliaro Nov 30, 2023
c534638
Update inc_multihead_self_attention.cu
goliaro Dec 3, 2023
fd956c9
Update inc_multihead_self_attention.cu
goliaro Dec 4, 2023
d9b154f
Merge branch 'inference' into peft
goliaro Dec 4, 2023
3a34c88
use grad to store peft in/output (#1241)
xinhaoc Dec 6, 2023
94230d9
format
jiazhihao Dec 6, 2023
b985cc9
enable peft request
jiazhihao Dec 6, 2023
b9c3926
several hacks for performance measurement; some of the changes should…
jiazhihao Dec 6, 2023
4d5c3e0
Update sigmoid_silu_multi.cu
goliaro Dec 16, 2023
7bf863a
RoPE backward
goliaro Dec 18, 2023
960654e
PEFT bug fixes and alignment (#1269)
goliaro Jan 10, 2024
2028900
Fuse bias + relu in OPT (#1271)
goliaro Jan 10, 2024
3bbde56
fix
goliaro Jan 10, 2024
2ebd7f4
fix
goliaro Jan 17, 2024
1b2018b
fix
goliaro Jan 17, 2024
bc61e9d
Peft alignment & debugging tools (#1288)
goliaro Jan 27, 2024
32f0a15
fix legion aliasing error
goliaro Jan 27, 2024
c97f63a
fix warnings
goliaro Jan 27, 2024
3d5a37c
fix
goliaro Jan 27, 2024
571f0d3
fix pipeline parallelism
goliaro Jan 29, 2024
f4a10f3
fix tp issue in combine op
goliaro Jan 29, 2024
ca683f7
fix lora weight loading with tensor parallelism
goliaro Jan 29, 2024
378bdb5
fixes, implement Combine::peft_bwd_task
goliaro Jan 29, 2024
afdae45
fix
goliaro Jan 29, 2024
5660f55
replicate peft bwd
goliaro Jan 29, 2024
a9bacd3
fixes
goliaro Jan 30, 2024
f3a97ff
fix
goliaro Jan 31, 2024
e0a58bb
fix combine and fwd-bwd pass dependencies
goliaro Jan 31, 2024
50fc13d
fix replicate bwd
goliaro Jan 31, 2024
f2c9a05
fix
goliaro Feb 1, 2024
cd68f5d
let user control amount of peft memory
goliaro Feb 3, 2024
64a59d8
only run peft_bwd if peft is enabled
goliaro Feb 3, 2024
32a0716
fix rms norm inference region reqs
goliaro Feb 6, 2024
a37b173
fix in-place fusion (part 1)
goliaro Feb 7, 2024
85f4d40
fix inplace fusion (part 2)
goliaro Feb 7, 2024
bb56a99
fix
goliaro Feb 7, 2024
63f1fce
disable automatic inplace rms norm for now
goliaro Feb 7, 2024
0d3aa7e
fix inf fusion inplace
goliaro Feb 8, 2024
b658061
fix rest input grads for peft without inplace residuals
goliaro Feb 9, 2024
3255fe4
fix
goliaro Feb 9, 2024
ec2002e
fix
goliaro Feb 15, 2024
098e880
fix residual rms
goliaro Feb 16, 2024
5688e16
fix
goliaro Feb 16, 2024
9225e0c
fix
goliaro Feb 16, 2024
e12bff1
enable inf debugging in fusion bwd
goliaro Feb 19, 2024
ed9afb7
hack to silence warning in fused bwd
goliaro Feb 19, 2024
96d0e9b
fix
goliaro Feb 19, 2024
fcbeea0
Merge branch 'inference' into peft
goliaro Feb 19, 2024
2cbc0b7
fix
goliaro Feb 19, 2024
36cb2b3
fix build
goliaro Feb 19, 2024
21b77f1
fix
goliaro Feb 19, 2024
9075d3f
fix
goliaro Feb 19, 2024
0b35b0c
add draft peft test
goliaro Mar 22, 2024
b6ada2f
Peft python interface (#1306)
goliaro Mar 27, 2024
29fcda7
Merge branch 'inference' into peft
goliaro Apr 8, 2024
0ed889a
fix
goliaro Apr 8, 2024
48c431a
update
goliaro Apr 11, 2024
40649ee
fix
goliaro Apr 12, 2024
0580d7e
fix to support prompts larger than max tokens per batch
goliaro Apr 13, 2024
0affe27
fixes to support benchmarking of finetuning throughput
goliaro Apr 14, 2024
d7ebeaf
many upgrades and updates related to finetuning
goliaro Apr 15, 2024
33e873d
add ttft statistics
goliaro Apr 15, 2024
2f92a65
add warmup phase
goliaro Apr 15, 2024
b1e97b1
add benchmarking code
goliaro Apr 16, 2024
e35ebb2
Add scripts for evaluation with Microsoft Azure trace (#1363)
Flechman Apr 17, 2024
f3f6226
Merge branch 'inference' into peft
goliaro Apr 24, 2024
b33f10f
fix
goliaro Apr 25, 2024
97562d6
fix
goliaro May 1, 2024
985c254
add peft tests to ci
goliaro May 1, 2024
33dbd3d
Merge branch 'inference' into peft
goliaro May 1, 2024
f033b4e
shellcheck
goliaro May 8, 2024
1011927
fix
goliaro May 9, 2024
9064c2b
fix python requirements
goliaro May 9, 2024
a125e86
fix
goliaro May 10, 2024
d74fe53
fix
goliaro May 11, 2024
0c6ae09
update ci test
goliaro May 17, 2024
93b6032
update alignment doc
goliaro May 17, 2024
9546239
fix cross entropy loss bug
goliaro May 19, 2024
ff4b703
update alignment test
goliaro May 19, 2024
b613666
update test
goliaro May 20, 2024
dde0b61
add llama peft alignment test to ci
goliaro May 20, 2024
1a31b65
Fix values for unused params in incr_decoding
Flechman May 24, 2024
7e3d111
Add PEFTModelID NO_ID singleton instead of None
Flechman May 24, 2024
079ba59
Fix PEFTModelID::NO_ID reference
Flechman May 25, 2024
f464eb8
reduce logging
goliaro May 25, 2024
8d89acd
fix
goliaro May 26, 2024
33c0fef
fix
goliaro May 29, 2024
6727d3a
Add peft demo
Flechman Jun 11, 2024
6d7c245
Add readme for demo
Flechman Jun 11, 2024
511fd64
fix alignment issue
goliaro Jun 20, 2024
2899ba2
Initial implementation of disaggregated attention and qkv projection
yingchen21 Jul 9, 2024
94e1563
fixed filename problem from renaming weight file
yingchen21 Jul 10, 2024
4 changes: 4 additions & 0 deletions .github/workflows/gpu-ci.yml
@@ -174,6 +174,10 @@ jobs:
# Inference tests
source ./build/set_python_envs.sh
./tests/inference_tests.sh

# PEFT tests
./tests/peft_tests.sh
python ./tests/peft/peft_alignment_test.py

- name: Save inference output as an artifact
if: always()
5 changes: 5 additions & 0 deletions .gitignore
@@ -187,4 +187,9 @@ gpt_tokenizer
python/flexflow/version.txt

inference_tensors
hf_peft_tensors
lora_training_logs

Untitled-1.ipynb
Untitled-2.ipynb
tests/inference/python_test_configs/*.json
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -558,6 +558,7 @@ if(NOT BUILD_LEGION_ONLY)
if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
add_subdirectory(inference/spec_infer)
add_subdirectory(inference/incr_decoding)
add_subdirectory(inference/peft)
endif()


7 changes: 7 additions & 0 deletions conda/flexflow.yml
@@ -25,3 +25,10 @@ dependencies:
- sentencepiece
- einops
- requests
- scipy
- bitsandbytes
- datasets
- accelerate
- loralib
- triton
- peft
2 changes: 2 additions & 0 deletions docker/flexflow-environment/Dockerfile
@@ -93,6 +93,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1
RUN conda install pytorch torchvision torchaudio -c pytorch
RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
RUN pip3 install tensorflow notebook
# PEFT-related
RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft

# Install Rust
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
27 changes: 23 additions & 4 deletions include/flexflow/batch_config.h
@@ -16,6 +16,7 @@
#pragma once

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "legion.h"
#include <cstddef>
#include <cstdlib>
@@ -43,6 +44,8 @@ class BatchConfig {
BatchConfig();
int num_active_requests() const;
int num_active_tokens() const;
int num_active_infr_tokens() const;
int num_active_peft_tokens() const;
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_verify_tokens_per_batch();
@@ -56,26 +59,41 @@ class BatchConfig {
// Maximum possible values for different parameters
// These maximum values are used for copying BatchConfig
// across workers
static int const MAX_NUM_REQUESTS = 64;
static int const MAX_NUM_REQUESTS = 65;
static int const MAX_NUM_TOKENS = 1024;
static int const MAX_SPEC_TREE_TOKEN_NUM = 64;

// Set by update
int num_tokens;

int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0;
// number of tokens in prompt phase, start offset of tokens in inc_decoding
// phase. num_tokens - num_prompt_tokens = num_generation_tokens;
int num_generation_tokens;
int num_generation_tokens = 0;

struct PerRequestInfo {
PerRequestInfo() {
first_token_depth_in_request = 0;
first_token_offset_in_batch = 0;
num_tokens_in_batch = 0;
max_sequence_length = 0;
request_guid = 0;
prompt_phase = false;
batch_config_request_id = -1;
peft_model_id = PEFTModelID::NO_ID;
peft_bwd = false;
}
int first_token_depth_in_request;
int first_token_offset_in_batch;
int num_tokens_in_batch;
int max_sequence_length;

// request id in batch config:
int batch_config_request_id;
int batch_config_request_id = -1;
bool prompt_phase = false;
RequestGuid request_guid;
// PEFT fields
PEFTModelID peft_model_id;
bool peft_bwd;
};
struct PerTokenInfo {
int abs_depth_in_request;
@@ -102,6 +120,7 @@ class BatchConfig {
BitMask causalMask[MAX_NUM_REQUESTS];
PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
PerTokenInfo labelsInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
bool request_running[MAX_NUM_REQUESTS];
41 changes: 33 additions & 8 deletions include/flexflow/config.h
@@ -65,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
#endif

class FFConfig;
class MemoryAllocator;
class PEFTWeightAllocator;

struct CombinedBatchConfigMetaStruct {
BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS];
BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS];
BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS];
bool request_completed[BatchConfig::MAX_NUM_REQUESTS];

BeamSearchBatchConfig::BeamSearchPerTokenInfo
beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS +
BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM *
BeamSearchBatchConfig::MAX_NUM_REQUESTS];
BeamSearchBatchConfig::BeamSearchPerRequestInfo
beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS];

TreeVerifyBatchConfig::CommittedTokensInfo
committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS];
};

struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
@@ -76,18 +95,18 @@ struct FFHandler {
#endif
void *workSpace;
size_t workSpaceSize;
void *batch_config_metadata;
CombinedBatchConfigMetaStruct *batch_config_metadata;

// request info + token info + topology mask info
size_t batch_config_metadata_size =
sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
sizeof(BeamSearchBatchConfig::beamTokenInfo) +
sizeof(BeamSearchBatchConfig::beamRequestsInfo) +
sizeof(BatchConfig::causalMask) +
sizeof(TreeVerifyBatchConfig::committed_tokens) +
sizeof(BatchConfig::request_completed);
size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct);
void *offload_reserve_space;
size_t offload_reserve_space_size;
// PEFT related fields
MemoryAllocator *peft_activation_allocator;
size_t peft_activation_reserve_space_size;
PEFTWeightAllocator *peft_weight_allocator;
size_t peft_weight_reserve_space_size;
// Quantization fields
DataType quantization_type;
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
@@ -98,6 +117,8 @@ struct FFInitInfo {
struct FFInitInfo {
size_t workSpaceSize;
size_t offload_reserve_space_size;
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
DataType quantization_type;
bool allowTensorOpMathConversion;
// int myRank, allRanks;
@@ -155,6 +176,10 @@ class FFConfig {
bool cpu_offload;
size_t offload_reserve_space_size;
DataType quantization_type;
// PEFT related fields
bool enable_peft;
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
bool only_data_parallel;
bool enable_sample_parallel;
15 changes: 15 additions & 0 deletions include/flexflow/ffconst.h
@@ -46,6 +46,12 @@ enum LossType {
LOSS_IDENTITY = 54,
};

enum OptimizerType {
OPTIMIZER_TYPE_NONE = 60,
OPTIMIZER_TYPE_SGD = 61,
OPTIMIZER_TYPE_ADAM = 62,
};

enum CompMode {
COMP_MODE_TRAINING = 70,
COMP_MODE_INFERENCE = 71,
@@ -72,6 +78,11 @@ enum InferenceMode {
TREE_VERIFY_MODE = 2003,
};

enum RequestType {
REQ_INFERENCE = 4001,
REQ_FINETUNING = 4002,
};

// This is consistent with TASO's OpType
// https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138
enum OperatorType {
@@ -172,6 +183,8 @@ enum OperatorType {
OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
OP_SAMPLING,
// PEFT Ops
OP_LORA,
// Parallel Ops
OP_REPARTITION,
OP_COMBINE,
@@ -268,5 +281,7 @@ enum {
TENSOR_GUID_LAST_VALID = 3999999,
PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000,
NODE_GUID_FIRST_VALID = 5000000,
PEFT_MODEL_ID_FIRST_VALID = 6000000,
PEFT_MODEL_ID_LAST_VALID = 6999999
};
#endif // _FLEXFLOW_CONST_H_
25 changes: 25 additions & 0 deletions include/flexflow/fftype.h
@@ -3,6 +3,8 @@

#include "flexflow/ffconst.h"
#include <cstddef>
#include <functional>
#include <iostream>

namespace FlexFlow {

@@ -18,6 +20,29 @@ class LayerID {
size_t id, transformer_layer_id, model_id;
};

class PEFTModelID {
public:
static const PEFTModelID NO_ID;
PEFTModelID();
PEFTModelID(size_t id);
bool is_valid_id() const;
friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
friend std::ostream &operator<<(std::ostream &os,
PEFTModelID const &peft_model_id);

public:
size_t id;
};

}; // namespace FlexFlow

namespace std {
template <>
struct hash<FlexFlow::PEFTModelID> {
size_t operator()(FlexFlow::PEFTModelID const &n) const {
return n.id;
}
};
} // namespace std

#endif // _FF_TYPE_H
44 changes: 40 additions & 4 deletions include/flexflow/flexflow_c.h
@@ -55,6 +55,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t);
FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t);
FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t);

// -----------------------------------------------------------------------
// FFConfig
@@ -270,6 +272,7 @@ flexflow_tensor_t *
bool elementwise_affine,
float eps,
bool use_bias,
bool inplace_residual,
char const *name);

flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
Expand All @@ -281,6 +284,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
bool elementwise_affine,
float eps,
bool use_bias,
bool inplace_residual,
char const *name);

flexflow_tensor_t
@@ -565,6 +569,7 @@ flexflow_tensor_t *
const flexflow_tensor_t input2_,
float eps,
int dim,
bool inplace_residual,
char const *name);

flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
@@ -590,6 +595,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
bool beam_search,
char const *name);

flexflow_peft_model_id_t flexflow_model_add_lora_layer(
flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_);

void flexflow_model_set_sgd_optimizer(flexflow_model_t handle,
flexflow_sgd_optimizer_t optimizer);

@@ -613,10 +621,13 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);

void flexflow_model_generate(flexflow_model_t handle_,
int num_requests,
char const **input_text,
int max_num_chars,
char **output_text,
int max_seq_length,
enum RequestType *request_types,
char const **input_texts,
char **output_texts,
int *max_seq_lengths,
flexflow_peft_model_id_t *peft_model_ids,
char const **dataset_filepaths,
int *training_steps,
int **output_length_and_tokens);

void flexflow_model_set_position_offset(flexflow_model_t handle, int offset);
@@ -978,6 +989,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num(
void flexflow_request_manager_set_max_sequence_length(
flexflow_request_manager_t handle_, int max_seq_length);

void flexflow_request_manager_set_enable_peft_finetuning(
flexflow_request_manager_t handle_, bool enable_peft_finetuning_);

void flexflow_request_manager_register_tokenizer(
flexflow_request_manager_t handle_,
enum ModelType model_type,
@@ -1036,6 +1050,28 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);
void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
flexflow_model_t model_handle_);

// -----------------------------------------------------------------------
// LoraLinearConfig
// -----------------------------------------------------------------------

flexflow_lora_linear_config_t
flexflow_lora_linear_config_create(char const *cache_folder_,
char const *peft_model_id_);

void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_);

// -----------------------------------------------------------------------
// PEFTModelID
// -----------------------------------------------------------------------

flexflow_peft_model_id_t flexflow_peft_model_id_create();

flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id);

flexflow_peft_model_id_t flexflow_peft_model_id_no_id();

void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_);

#ifdef __cplusplus
}
#endif
2 changes: 1 addition & 1 deletion include/flexflow/layer.h
@@ -49,7 +49,7 @@ class Layer {
Tensor outputs[MAX_NUM_OUTPUTS];
Tensor inputs[MAX_NUM_INPUTS];
Tensor weights[MAX_NUM_WEIGHTS];
bool trainableInputs[MAX_NUM_INPUTS];
// bool trainable_inputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;