[Tool] - Add mechanism to save operators' tensors to file #1174

Merged: 16 commits, Oct 8, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -185,3 +185,5 @@ gpt_tokenizer

# pip version
python/flexflow/version.txt

inference_tensors
2 changes: 1 addition & 1 deletion conda/pytorch-gpu.yml
@@ -3,7 +3,7 @@ channels:
- defaults
- conda-forge
dependencies:
- python>=3.6
- python>=3.6,<3.12
- pip
- pip:
- numpy>=1.16.0
13 changes: 10 additions & 3 deletions include/flexflow/batch_config.h
@@ -46,7 +46,9 @@ class BatchConfig {
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_sequence_length();
friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
virtual InferenceMode get_mode() const;
static BatchConfig const *from_future(BatchConfigFuture const &future);
// Maximum possible values for different parameters
@@ -55,9 +57,8 @@
static int const MAX_NUM_REQUESTS = 64;
static int const MAX_NUM_TOKENS = 1024;

// These are set by update
// Set by update
int num_tokens;
bool loading_prompt = false;

struct PerRequestInfo {
int token_start_offset;
@@ -74,15 +75,18 @@
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
bool request_running[MAX_NUM_TOKENS];
bool request_running[MAX_NUM_REQUESTS];
};

class TreeVerifyBatchConfig : public BatchConfig {
public:
TreeVerifyBatchConfig();
~TreeVerifyBatchConfig();
InferenceMode get_mode() const;
friend std::ostream &operator<<(std::ostream &os,
TreeVerifyBatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
struct CommittedTokensInfo {
int token_index; // the index of the token in the previous batch
int request_index; // request index in the batch
@@ -108,7 +112,10 @@ class BeamSearchBatchConfig : public BatchConfig {

~BeamSearchBatchConfig();

friend std::ostream &operator<<(std::ostream &os,
BeamSearchBatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
bool done() const;
int max_beam_depth_all_requests() const;
int current_depth_all_requests() const;
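Each batch-config class above now pairs an `operator<<` overload with a `save_to_file` method. As a point of reference, a minimal sketch of how `save_to_file` can delegate to `operator<<` (an illustrative assumption, not necessarily the PR's exact implementation):

```cpp
#include <cassert>
#include <fstream>
#include <iostream>

void BatchConfig::save_to_file(std::string const &filename) const {
  std::ofstream outfile(filename);
  if (!outfile.is_open()) {
    std::cerr << "Error: cannot open file " << filename << std::endl;
    assert(false);
  }
  // Reuse the textual representation produced by the friend operator<<.
  outfile << *this;
}
```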
1 change: 1 addition & 0 deletions include/flexflow/config.h
@@ -134,6 +134,7 @@ class FFConfig {
Legion::Runtime *lg_hlr;
// Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
float search_alpha;
4 changes: 2 additions & 2 deletions include/flexflow/fftype.h
@@ -10,12 +10,12 @@ class LayerID {
public:
static const LayerID NO_ID;
LayerID();
LayerID(size_t id, size_t transformer_layer_id);
LayerID(size_t id, size_t transformer_layer_id, size_t model_id);
bool is_valid_id() const;
friend bool operator==(LayerID const &lhs, LayerID const &rhs);

public:
size_t id, transformer_layer_id;
size_t id, transformer_layer_id, model_id;
};

}; // namespace FlexFlow
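The new `model_id` field naturally participates in `LayerID` equality. A hedged sketch of the corresponding `operator==` definition (for illustration only; the real definition lives in the .cc file):

```cpp
bool operator==(LayerID const &lhs, LayerID const &rhs) {
  // With multiple models in one process, a layer id is only unique within
  // its model, so model_id must be compared alongside the layer's own ids.
  return lhs.id == rhs.id &&
         lhs.transformer_layer_id == rhs.transformer_layer_id &&
         lhs.model_id == rhs.model_id;
}
```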
1 change: 1 addition & 0 deletions include/flexflow/layer.h
@@ -52,6 +52,7 @@ class Layer {
bool trainableInputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;

private:
std::unordered_map<std::string, long long> int_properties;
5 changes: 5 additions & 0 deletions include/flexflow/model.h
@@ -1234,6 +1234,8 @@ class FFModel {
std::unordered_map<size_t, NoOp *> cached_noop_ops;
std::unordered_map<size_t, NoOp *> cached_input_ops;
std::vector<MachineView> all_valid_views;
int model_id; // unique incremental id assigned to each model. Used in the
// inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms;
#endif
@@ -1262,6 +1264,9 @@
ElementUnary *
unary(OperatorType op, char const *name = NULL, float scalar = 0.0);
PCG::Node new_node(Op *);
static int model_counter; // number of instantiated FFModel objects. Used to
// assign a unique incremental id to each model.
// Used in the inference_debugging mode.
};

class UtilityTasks {
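The two comments above describe a simple scheme: a process-wide counter, bumped once per model. A sketch of the intended assignment, under the assumption that it happens during FFModel construction (the constructor shape shown is a placeholder, not the exact signature):

```cpp
// One counter per process, shared by all FFModel instances.
int FFModel::model_counter = 0;

FFModel::FFModel(FFConfig &config /* , ... */) {
  // Each new model takes the next id, so tensors dumped in
  // inference_debugging mode can be attributed to the model that wrote them.
  model_id = model_counter++;
  // ... rest of construction ...
}
```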
4 changes: 4 additions & 0 deletions include/flexflow/op_meta.h
@@ -15,6 +15,10 @@ class OpMeta {
public:
FFHandler handle;
bool profiling; // Measure the run time of the task
bool inference_debugging;
int decoding_step;
char op_name[MAX_OPNAME];
LayerID layer_guid;
bool trainableInputs[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
10 changes: 10 additions & 0 deletions include/flexflow/operator.h
@@ -1,6 +1,7 @@
#ifndef _OPERATOR_H
#define _OPERATOR_H

#include "flexflow/accessor.h"
#include "flexflow/batch_config.h"
#include "flexflow/fftype.h"
#include "flexflow/machine_view.h"
@@ -183,6 +184,7 @@ class Op {
const ParallelTensor input4 = NULL);
Op(int guid,
bool profiling,
bool inference_debugging,
OperatorType otype,
DataType dtype,
char const *name,
@@ -225,6 +227,13 @@
assert(false);
};
virtual void print_layer(FFModel const &model) = 0;
static void save_inference_tensors_to_file(
OpMeta *m,
int shard_id,
BatchConfig const *bc,
std::vector<GenericTensorAccessorR> input_tensors,
std::vector<GenericTensorAccessorR> weight_tensors,
std::vector<GenericTensorAccessorW> output_tensors);
virtual bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const = 0;
@@ -316,6 +325,7 @@ class Op {
std::map<ParallelTensor, OpMeta *[MAX_NUM_WORKERS]> inference_meta;
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;
bool add_bias_only_once;
#ifdef FF_USE_NCCL
ncclUniqueId ncclId;
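To show where the new static hook fits, a hedged sketch of a typical operator inference task (the task signature and variable names are illustrative placeholders, not FlexFlow's exact API):

```cpp
void example_inference_task(OpMeta *m,
                            int shard_id,
                            BatchConfig const *bc,
                            GenericTensorAccessorR const &input,
                            GenericTensorAccessorR const &weight,
                            GenericTensorAccessorW const &output) {
  // ... launch the operator's forward kernel first ...
  if (m->inference_debugging) {
    // Dump this shard's inputs, weights, and outputs for the current
    // decoding step so they can be diffed against a reference run.
    Op::save_inference_tensors_to_file(
        m, shard_id, bc, {input}, {weight}, {output});
  }
}
```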
1 change: 0 additions & 1 deletion include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -106,7 +106,6 @@ class AddBiasResidualLayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/element_unary.h
@@ -26,7 +26,6 @@ class ElementUnaryMeta : public OpMeta {
DataType data_type;
bool inplace;
float scalar;
char op_name[MAX_OPNAME];
};

class ElementUnary : public Op {
3 changes: 0 additions & 3 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -185,9 +185,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
bool *qk_prod_scaling;
bool *position_bias;
float scaling_factor;
#ifdef INFERENCE_TESTS
float *kcache, *vcache;
#endif
void *weight_ptr, *bias_ptr; // for weight offload
void *devQKVProjArray, *keyCache, *valueCache;
void *qk_prods, *qk_prods_softmax;
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/concat_kernels.h
@@ -12,7 +12,6 @@ class ConcatMeta : public OpMeta {
public:
ConcatMeta(FFHandler handle) : OpMeta(handle){};
int legion_axis;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/conv_2d_kernels.h
@@ -28,7 +28,6 @@ class Conv2DMeta : public OpMeta {
miopenConvBwdDataAlgorithm_t bwdDataAlgo;
#endif
bool relu, use_bias;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/element_binary_kernels.h
@@ -23,7 +23,6 @@ class ElementBinaryMeta : public OpMeta {
OperatorType op_type;
bool inplace_a, has_same_operands;
bool broadcast_input1, broadcast_input2;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/linear_kernels.h
@@ -34,7 +34,6 @@ class LinearMeta : public OpMeta {
RegularizerMode kernel_reg_type;
float kernel_reg_lambda;
bool use_bias, add_bias_only_once;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/pool_2d_kernels.h
@@ -14,7 +14,6 @@ class Pool2DMeta : public OpMeta {
ffActivationDescriptor_t actiDesc;
ffPoolingDescriptor_t poolDesc;
bool relu;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/residual_rms_norm_kernels.h
@@ -37,7 +37,6 @@ class ResidualRMSNormMeta : public OpMeta {
int in_dim;
int batch_size;
int num_elements;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/rms_norm_kernels.h
@@ -37,7 +37,6 @@ class RMSNormMeta : public OpMeta {
int in_dim;
int batch_size;
int num_elements;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

2 changes: 1 addition & 1 deletion include/flexflow/ops/kernels/softmax_kernels.h
@@ -21,8 +21,8 @@ class SoftmaxMeta : public OpMeta {
miopenTensorDescriptor_t outputTensor;
#endif
bool profiling;
bool inference_debugging;
int dim;
char op_name[MAX_OPNAME];
DataType input_type, output_type;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/layer_norm.h
@@ -123,7 +123,6 @@ class LayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 1 addition & 0 deletions include/flexflow/ops/linear.h
@@ -96,6 +96,7 @@ class Linear : public Op {
private:
Linear(int guid,
bool profiling,
bool inference_debugging,
const ParallelTensor input,
int out_dim,
ActiMode activation,
1 change: 0 additions & 1 deletion include/flexflow/ops/residual_layer_norm.h
@@ -106,7 +106,6 @@ class ResidualLayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/residual_rms_norm.h
@@ -80,7 +80,6 @@ class ResidualRMSNorm : public Op {

public:
float eps;
char op_name[MAX_OPNAME];
int effective_batch_size;
int dim, data_dim;
};
1 change: 0 additions & 1 deletion include/flexflow/ops/rms_norm.h
@@ -79,7 +79,6 @@ class RMSNorm : public Op {

public:
float eps;
char op_name[MAX_OPNAME];
int effective_batch_size;
int dim, data_dim;
};
1 change: 0 additions & 1 deletion include/flexflow/ops/sigmoid_silu_multi.h
@@ -79,7 +79,6 @@ class SigmoidSiluMultiMeta : public OpMeta {
~SigmoidSiluMultiMeta(void);

public:
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

3 changes: 3 additions & 0 deletions include/flexflow/utils/hip_helper.h
@@ -137,6 +137,9 @@ __host__ void updateGAS(float *para_ptr,
template <typename T>
void print_tensor(T const *ptr, size_t num_elements, char const *prefix);

template <typename T>
void save_tensor(T const *ptr, size_t num_elements, char const *file_name);

template <typename T>
T *download_tensor(T const *ptr, size_t num_elements);

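A sketch of one way the `save_tensor` template could be implemented, mirroring the declaration above: stage the buffer on the host, then write it out as text. The `checkHIP` error-checking macro is an assumption (modeled on FlexFlow's CUDA helpers), not a confirmed part of this header:

```cpp
#include <fstream>
#include <vector>
#include <hip/hip_runtime.h>

template <typename T>
void save_tensor(T const *ptr, size_t num_elements, char const *file_name) {
  // Stage the device-resident tensor in host memory.
  std::vector<T> host_buf(num_elements);
  // checkHIP is assumed to be an error-checking macro akin to checkCUDA.
  checkHIP(hipMemcpy(host_buf.data(), ptr,
                     num_elements * sizeof(T), hipMemcpyDeviceToHost));
  // One element per line, so dumps can be compared with ordinary text tools.
  std::ofstream out(file_name);
  for (size_t i = 0; i < num_elements; i++) {
    out << host_buf[i] << "\n";
  }
}
```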
1 change: 1 addition & 0 deletions inference/python/incr_decoding.py
@@ -55,6 +55,7 @@ def get_configs():
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"profiling": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
1 change: 1 addition & 0 deletions inference/python/spec_infer.py
@@ -55,6 +55,7 @@ def get_configs():
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"profiling": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
1 change: 1 addition & 0 deletions python/flexflow/core/__init__.py
@@ -41,6 +41,7 @@
"num_cpus": "-ll:cpu",
"legion_utility_processors": "-ll:util",
"profiling": "--profiling",
"inference_debugging": "--inference-debugging",
"fusion": "--fusion",
"disable_control_replication": "--disable-control-replication",
# Training args
8 changes: 8 additions & 0 deletions python/flexflow/serve/__init__.py
@@ -45,6 +45,7 @@ def init(
use_4bit_quantization: Optional[bool] = None,
use_8bit_quantization: Optional[bool] = None,
profiling: Optional[bool] = None,
inference_debugging: Optional[bool] = None,
fusion: Optional[bool] = None,
):
"""
@@ -71,6 +72,7 @@ def init(
- use_4bit_quantization: whether to use 4-bit quantization, defaults to False
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True

The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments.
@@ -104,6 +106,8 @@ def init(
:type use_8bit_quantization: Optional[bool], optional
:param profiling: whether to enable the FlexFlow profiling mode, defaults to False
:type profiling: Optional[bool], optional
:param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
:type inference_debugging: Optional[bool], optional
:param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
:type fusion: Optional[bool], optional

@@ -128,6 +132,7 @@
use_4bit_quantization is not None,
use_8bit_quantization is not None,
profiling is not None,
inference_debugging is not None,
fusion is not None,
]
):
@@ -152,6 +157,7 @@
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"profiling": profiling,
"inference_debugging": inference_debugging,
"fusion": fusion,
}

@@ -195,6 +201,8 @@ def init(
configs_dict["use_8bit_quantization"] = False
if configs_dict.get("profiling", None) is None:
configs_dict["profiling"] = False
if configs_dict.get("inference_debugging", None) is None:
configs_dict["inference_debugging"] = False
if configs_dict.get("fusion", None) is None:
configs_dict["fusion"] = True
