[Tool] - Add mechanism to save operators' tensors to file #1174

Merged: 16 commits, Oct 8, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -185,3 +185,5 @@ gpt_tokenizer

# pip version
python/flexflow/version.txt

inference_tensors
2 changes: 1 addition & 1 deletion conda/pytorch-gpu.yml
@@ -3,7 +3,7 @@ channels:
- defaults
- conda-forge
dependencies:
- python>=3.6
- python>=3.6,<3.12
- pip
- pip:
- numpy>=1.16.0
13 changes: 10 additions & 3 deletions include/flexflow/batch_config.h
@@ -46,7 +46,9 @@ class BatchConfig {
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_sequence_length();
friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
virtual InferenceMode get_mode() const;
static BatchConfig const *from_future(BatchConfigFuture const &future);
// Maximum possible values for different parameters
@@ -55,9 +57,8 @@
static int const MAX_NUM_REQUESTS = 64;
static int const MAX_NUM_TOKENS = 1024;

// These are set by update
// Set by update
int num_tokens;
bool loading_prompt = false;

struct PerRequestInfo {
int token_start_offset;
@@ -74,15 +75,18 @@
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
bool request_running[MAX_NUM_TOKENS];
bool request_running[MAX_NUM_REQUESTS];
};

class TreeVerifyBatchConfig : public BatchConfig {
public:
TreeVerifyBatchConfig();
~TreeVerifyBatchConfig();
InferenceMode get_mode() const;
friend std::ostream &operator<<(std::ostream &os,
TreeVerifyBatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
struct CommittedTokensInfo {
int token_index; // the index of the token in the previous batch
int request_index; // request index in the batch
@@ -108,7 +112,10 @@ class BeamSearchBatchConfig : public BatchConfig {

~BeamSearchBatchConfig();

friend std::ostream &operator<<(std::ostream &os,
BeamSearchBatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
bool done() const;
int max_beam_depth_all_requests() const;
int current_depth_all_requests() const;
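Each batch-config class above now pairs an `operator<<` overload with a `save_to_file` method. As a point of reference, a minimal sketch of how `save_to_file` can delegate to `operator<<` (an illustrative assumption, not necessarily the PR's exact implementation):

```cpp
#include <cassert>
#include <fstream>
#include <iostream>

void BatchConfig::save_to_file(std::string const &filename) const {
  std::ofstream outfile(filename);
  if (!outfile.is_open()) {
    std::cerr << "Error: cannot open file " << filename << std::endl;
    assert(false);
  }
  // Reuse the textual representation produced by the friend operator<<.
  outfile << *this;
}
```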
1 change: 1 addition & 0 deletions include/flexflow/config.h
@@ -134,6 +134,7 @@ class FFConfig {
Legion::Runtime *lg_hlr;
// Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
float search_alpha;
4 changes: 2 additions & 2 deletions include/flexflow/fftype.h
@@ -10,12 +10,12 @@ class LayerID {
public:
static const LayerID NO_ID;
LayerID();
LayerID(size_t id, size_t transformer_layer_id);
LayerID(size_t id, size_t transformer_layer_id, size_t model_id);
bool is_valid_id() const;
friend bool operator==(LayerID const &lhs, LayerID const &rhs);

public:
size_t id, transformer_layer_id;
size_t id, transformer_layer_id, model_id;
};

}; // namespace FlexFlow
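The new `model_id` field naturally participates in `LayerID` equality. A hedged sketch of the corresponding `operator==` definition (for illustration only; the real definition lives in the .cc file):

```cpp
bool operator==(LayerID const &lhs, LayerID const &rhs) {
  // With multiple models in one process, a layer id is only unique within
  // its model, so model_id must be compared alongside the layer's own ids.
  return lhs.id == rhs.id &&
         lhs.transformer_layer_id == rhs.transformer_layer_id &&
         lhs.model_id == rhs.model_id;
}
```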
1 change: 1 addition & 0 deletions include/flexflow/layer.h
@@ -52,6 +52,7 @@ class Layer {
bool trainableInputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;

private:
std::unordered_map<std::string, long long> int_properties;
5 changes: 5 additions & 0 deletions include/flexflow/model.h
@@ -1234,6 +1234,8 @@ class FFModel {
std::unordered_map<size_t, NoOp *> cached_noop_ops;
std::unordered_map<size_t, NoOp *> cached_input_ops;
std::vector<MachineView> all_valid_views;
int model_id; // unique incremental id assigned to each model. Used in the
// inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms;
#endif
@@ -1262,6 +1264,9 @@
ElementUnary *
unary(OperatorType op, char const *name = NULL, float scalar = 0.0);
PCG::Node new_node(Op *);
static int model_counter; // number of instantiated FFModel objects. Used to
// assign a unique incremental id to each model.
// Used in the inference_debugging mode.
};

class UtilityTasks {
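The two comments above describe a simple scheme: a process-wide counter, bumped once per model. A sketch of the intended assignment, under the assumption that it happens during FFModel construction (the constructor shape shown is a placeholder, not the exact signature):

```cpp
// One counter per process, shared by all FFModel instances.
int FFModel::model_counter = 0;

FFModel::FFModel(FFConfig &config /* , ... */) {
  // Each new model takes the next id, so tensors dumped in
  // inference_debugging mode can be attributed to the model that wrote them.
  model_id = model_counter++;
  // ... rest of construction ...
}
```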
4 changes: 4 additions & 0 deletions include/flexflow/op_meta.h
@@ -15,6 +15,10 @@ class OpMeta {
public:
FFHandler handle;
bool profiling; // Measure the run time of the task
bool inference_debugging;
int decoding_step;
char op_name[MAX_OPNAME];
LayerID layer_guid;
bool trainableInputs[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
10 changes: 10 additions & 0 deletions include/flexflow/operator.h
@@ -1,6 +1,7 @@
#ifndef _OPERATOR_H
#define _OPERATOR_H

#include "flexflow/accessor.h"
#include "flexflow/batch_config.h"
#include "flexflow/fftype.h"
#include "flexflow/machine_view.h"
@@ -183,6 +184,7 @@ class Op {
const ParallelTensor input4 = NULL);
Op(int guid,
bool profiling,
bool inference_debugging,
OperatorType otype,
DataType dtype,
char const *name,
@@ -225,6 +227,13 @@
assert(false);
};
virtual void print_layer(FFModel const &model) = 0;
static void save_inference_tensors_to_file(
OpMeta *m,
int shard_id,
BatchConfig const *bc,
std::vector<GenericTensorAccessorR> input_tensors,
std::vector<GenericTensorAccessorR> weight_tensors,
std::vector<GenericTensorAccessorW> output_tensors);
virtual bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const = 0;
@@ -316,6 +325,7 @@ class Op {
std::map<ParallelTensor, OpMeta *[MAX_NUM_WORKERS]> inference_meta;
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;
bool add_bias_only_once;
#ifdef FF_USE_NCCL
ncclUniqueId ncclId;
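To show where the new static hook fits, a hedged sketch of a typical operator inference task (the task signature and variable names are illustrative placeholders, not FlexFlow's exact API):

```cpp
void example_inference_task(OpMeta *m,
                            int shard_id,
                            BatchConfig const *bc,
                            GenericTensorAccessorR const &input,
                            GenericTensorAccessorR const &weight,
                            GenericTensorAccessorW const &output) {
  // ... launch the operator's forward kernel first ...
  if (m->inference_debugging) {
    // Dump this shard's inputs, weights, and outputs for the current
    // decoding step so they can be diffed against a reference run.
    Op::save_inference_tensors_to_file(
        m, shard_id, bc, {input}, {weight}, {output});
  }
}
```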
1 change: 0 additions & 1 deletion include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -106,7 +106,6 @@ class AddBiasResidualLayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/element_unary.h
@@ -26,7 +26,6 @@ class ElementUnaryMeta : public OpMeta {
DataType data_type;
bool inplace;
float scalar;
char op_name[MAX_OPNAME];
};

class ElementUnary : public Op {
3 changes: 0 additions & 3 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -185,9 +185,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
bool *qk_prod_scaling;
bool *position_bias;
float scaling_factor;
#ifdef INFERENCE_TESTS
float *kcache, *vcache;
#endif
void *weight_ptr, *bias_ptr; // for weight offload
void *devQKVProjArray, *keyCache, *valueCache;
void *qk_prods, *qk_prods_softmax;
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/concat_kernels.h
@@ -12,7 +12,6 @@ class ConcatMeta : public OpMeta {
public:
ConcatMeta(FFHandler handle) : OpMeta(handle){};
int legion_axis;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/conv_2d_kernels.h
@@ -28,7 +28,6 @@ class Conv2DMeta : public OpMeta {
miopenConvBwdDataAlgorithm_t bwdDataAlgo;
#endif
bool relu, use_bias;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/element_binary_kernels.h
@@ -23,7 +23,6 @@ class ElementBinaryMeta : public OpMeta {
OperatorType op_type;
bool inplace_a, has_same_operands;
bool broadcast_input1, broadcast_input2;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/linear_kernels.h
@@ -34,7 +34,6 @@ class LinearMeta : public OpMeta {
RegularizerMode kernel_reg_type;
float kernel_reg_lambda;
bool use_bias, add_bias_only_once;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/pool_2d_kernels.h
@@ -14,7 +14,6 @@ class Pool2DMeta : public OpMeta {
ffActivationDescriptor_t actiDesc;
ffPoolingDescriptor_t poolDesc;
bool relu;
char op_name[MAX_OPNAME];
};

namespace Kernels {
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/residual_rms_norm_kernels.h
@@ -37,7 +37,6 @@ class ResidualRMSNormMeta : public OpMeta {
int in_dim;
int batch_size;
int num_elements;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/rms_norm_kernels.h
@@ -37,7 +37,6 @@ class RMSNormMeta : public OpMeta {
int in_dim;
int batch_size;
int num_elements;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

2 changes: 1 addition & 1 deletion include/flexflow/ops/kernels/softmax_kernels.h
@@ -21,8 +21,8 @@ class SoftmaxMeta : public OpMeta {
miopenTensorDescriptor_t outputTensor;
#endif
bool profiling;
bool inference_debugging;
int dim;
char op_name[MAX_OPNAME];
DataType input_type, output_type;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/layer_norm.h
@@ -123,7 +123,6 @@ class LayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 1 addition & 0 deletions include/flexflow/ops/linear.h
@@ -96,6 +96,7 @@ class Linear : public Op {
private:
Linear(int guid,
bool profiling,
bool inference_debugging,
const ParallelTensor input,
int out_dim,
ActiMode activation,
1 change: 0 additions & 1 deletion include/flexflow/ops/residual_layer_norm.h
@@ -106,7 +106,6 @@ class ResidualLayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

1 change: 0 additions & 1 deletion include/flexflow/ops/residual_rms_norm.h
@@ -80,7 +80,6 @@ class ResidualRMSNorm : public Op {

public:
float eps;
char op_name[MAX_OPNAME];
int effective_batch_size;
int dim, data_dim;
};
1 change: 0 additions & 1 deletion include/flexflow/ops/rms_norm.h
@@ -79,7 +79,6 @@ class RMSNorm : public Op {

public:
float eps;
char op_name[MAX_OPNAME];
int effective_batch_size;
int dim, data_dim;
};
1 change: 0 additions & 1 deletion include/flexflow/ops/sigmoid_silu_multi.h
@@ -79,7 +79,6 @@ class SigmoidSiluMultiMeta : public OpMeta {
~SigmoidSiluMultiMeta(void);

public:
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

3 changes: 3 additions & 0 deletions include/flexflow/utils/hip_helper.h
@@ -137,6 +137,9 @@ __host__ void updateGAS(float *para_ptr,
template <typename T>
void print_tensor(T const *ptr, size_t num_elements, char const *prefix);

template <typename T>
void save_tensor(T const *ptr, size_t num_elements, char const *file_name);

template <typename T>
T *download_tensor(T const *ptr, size_t num_elements);

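A sketch of one way the `save_tensor` template could be implemented, mirroring the declaration above: stage the buffer on the host, then write it out as text. The `checkHIP` error-checking macro is an assumption (modeled on FlexFlow's CUDA helpers), not a confirmed part of this header:

```cpp
#include <fstream>
#include <vector>
#include <hip/hip_runtime.h>

template <typename T>
void save_tensor(T const *ptr, size_t num_elements, char const *file_name) {
  // Stage the device-resident tensor in host memory.
  std::vector<T> host_buf(num_elements);
  // checkHIP is assumed to be an error-checking macro akin to checkCUDA.
  checkHIP(hipMemcpy(host_buf.data(), ptr,
                     num_elements * sizeof(T), hipMemcpyDeviceToHost));
  // One element per line, so dumps can be compared with ordinary text tools.
  std::ofstream out(file_name);
  for (size_t i = 0; i < num_elements; i++) {
    out << host_buf[i] << "\n";
  }
}
```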
1 change: 1 addition & 0 deletions inference/python/incr_decoding.py
@@ -55,6 +55,7 @@ def get_configs():
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"profiling": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
1 change: 1 addition & 0 deletions inference/python/spec_infer.py
@@ -55,6 +55,7 @@ def get_configs():
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"profiling": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
1 change: 1 addition & 0 deletions python/flexflow/core/__init__.py
@@ -41,6 +41,7 @@
"num_cpus": "-ll:cpu",
"legion_utility_processors": "-ll:util",
"profiling": "--profiling",
"inference_debugging": "--inference-debugging",
"fusion": "--fusion",
"disable_control_replication": "--disable-control-replication",
# Training args
8 changes: 8 additions & 0 deletions python/flexflow/serve/__init__.py
@@ -45,6 +45,7 @@ def init(
use_4bit_quantization: Optional[bool] = None,
use_8bit_quantization: Optional[bool] = None,
profiling: Optional[bool] = None,
inference_debugging: Optional[bool] = None,
fusion: Optional[bool] = None,
):
"""
@@ -71,6 +72,7 @@ def init(
- use_4bit_quantization: whether to use 4-bit quantization, defaults to False
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True

The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments.
@@ -104,6 +106,8 @@ def init(
:type use_8bit_quantization: Optional[bool], optional
:param profiling: whether to enable the FlexFlow profiling mode, defaults to False
:type profiling: Optional[bool], optional
:param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
:type inference_debugging: Optional[bool], optional
:param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
:type fusion: Optional[bool], optional

@@ -128,6 +132,7 @@
use_4bit_quantization is not None,
use_8bit_quantization is not None,
profiling is not None,
inference_debugging is not None,
fusion is not None,
]
):
@@ -152,6 +157,7 @@
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"profiling": profiling,
"inference_debugging": inference_debugging,
"fusion": fusion,
}

@@ -195,6 +201,8 @@ def init(
configs_dict["use_8bit_quantization"] = False
if configs_dict.get("profiling", None) is None:
configs_dict["profiling"] = False
if configs_dict.get("inference_debugging", None) is None:
configs_dict["inference_debugging"] = False
if configs_dict.get("fusion", None) is None:
configs_dict["fusion"] = True
